From c1795ca6c8b1351a563a3e91023e46d18d59d52b Mon Sep 17 00:00:00 2001 From: Lucas M Date: Sun, 12 Mar 2017 11:51:59 -0700 Subject: [PATCH 001/200] [streamable] Update API URL --- youtube_dl/extractor/streamable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/streamable.py b/youtube_dl/extractor/streamable.py index e973c867c..9f5c237ef 100644 --- a/youtube_dl/extractor/streamable.py +++ b/youtube_dl/extractor/streamable.py @@ -65,7 +65,7 @@ class StreamableIE(InfoExtractor): # to return video info like the title properly sometimes, and doesn't # include info like the video duration video = self._download_json( - 'https://streamable.com/ajax/videos/%s' % video_id, video_id) + 'https://ajax.streamable.com/videos/%s' % video_id, video_id) # Format IDs: # 0 The video is being uploaded From ff9d509d200577a0be962ee47894cd257c7ef818 Mon Sep 17 00:00:00 2001 From: Vijay Singh Date: Mon, 13 Mar 2017 01:52:35 +0530 Subject: [PATCH 002/200] [openload] Fix extraction Just a minor fix for openload --- youtube_dl/extractor/openload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 5a5607357..9a42ab895 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -100,7 +100,7 @@ class OpenloadIE(InfoExtractor): i = int(B, 16) index = (h / 2) % 10 A = hashMap[index] - i = i ^ 137 + i = i ^ 96 i = i ^ A video_url_chars.append(compat_chr(i)) h += 2 From e313d209c25bcf8adf3c888516624f5c9b2f2eaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 13 Mar 2017 22:39:15 +0700 Subject: [PATCH 003/200] [mitele] Add support for ooyala videos (closes #12430) --- youtube_dl/extractor/mitele.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 79e0b8ada..28b743cca 100644 --- a/youtube_dl/extractor/mitele.py +++ 
b/youtube_dl/extractor/mitele.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import uuid from .common import InfoExtractor +from .ooyala import OoyalaIE from ..compat import ( compat_str, compat_urllib_parse_urlencode, @@ -24,6 +25,9 @@ class MiTeleBaseIE(InfoExtractor): r'(?s)()', webpage, 'ms video player')) video_id = player_data['data-media-id'] + if player_data.get('data-cms-id') == 'ooyala': + return self.url_result( + 'ooyala:%s' % video_id, ie=OoyalaIE.ie_key(), video_id=video_id) config_url = compat_urlparse.urljoin(url, player_data['data-config']) config = self._download_json( config_url, video_id, 'Downloading config JSON') From 9d089630229e9d921da255fc6d3f671d307a0848 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 13 Mar 2017 22:41:28 +0700 Subject: [PATCH 004/200] [telecinco] Add test for #12430 --- youtube_dl/extractor/telecinco.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index d5abfc9e4..fdcc7d573 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -44,6 +44,10 @@ class TelecincoIE(MiTeleBaseIE): }, { 'url': 'http://www.telecinco.es/espanasinirmaslejos/Espana-gran-destino-turistico_2_1240605043.html', 'only_matching': True, + }, { + # ooyala video + 'url': 'http://www.cuatro.com/chesterinlove/a-carta/chester-chester_in_love-chester_edu_2_2331030022.html', + 'only_matching': True, }] def _real_extract(self, url): From 66bf351f8052fb71dce20c3a5ba1aa507532222e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 14 Mar 2017 00:37:39 +0700 Subject: [PATCH 005/200] [facebook] Make title optional (closes #12443) --- youtube_dl/extractor/facebook.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 6315d40c5..b69c1ede0 100644 --- a/youtube_dl/extractor/facebook.py +++ 
b/youtube_dl/extractor/facebook.py @@ -196,6 +196,10 @@ class FacebookIE(InfoExtractor): }, { 'url': 'https://www.facebookcorewwwi.onion/video.php?v=274175099429670', 'only_matching': True, + }, { + # no title + 'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/', + 'only_matching': True, }] @staticmethod @@ -353,15 +357,15 @@ class FacebookIE(InfoExtractor): self._sort_formats(formats) video_title = self._html_search_regex( - r']*class="uiHeaderTitle"[^>]*>([^<]*)', webpage, 'title', - default=None) + r']*class="uiHeaderTitle"[^>]*>([^<]*)', webpage, + 'title', default=None) if not video_title: video_title = self._html_search_regex( r'(?s)(.*?)', webpage, 'alternative title', default=None) if not video_title: video_title = self._html_search_meta( - 'description', webpage, 'title') + 'description', webpage, 'title', default=None) if video_title: video_title = limit_length(video_title, 80) else: From 398887b4c09b3691379720314f4918bc094d1b7b Mon Sep 17 00:00:00 2001 From: Vijay Singh Date: Tue, 14 Mar 2017 05:19:18 +0530 Subject: [PATCH 006/200] [Openload] Fixed Extraction They did changed it again. 
--- youtube_dl/extractor/openload.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 9a42ab895..5ea749f35 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -96,14 +96,16 @@ class OpenloadIE(InfoExtractor): h = 0 while h < len(v): - B = v[h:h + 2] + B = v[h:h + 3] i = int(B, 16) - index = (h / 2) % 10 + if (h / 3) % 3 == 0: + i = int(B, 8) + index = (h / 3) % 10 A = hashMap[index] - i = i ^ 96 + i = i ^ 47 i = i ^ A video_url_chars.append(compat_chr(i)) - h += 2 + h += 3 video_url = 'https://openload.co/stream/%s?mime=true' video_url = video_url % (''.join(video_url_chars)) From 2a751e137ffecf616f04f036bd89c87e967647bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 15 Mar 2017 01:58:59 +0700 Subject: [PATCH 007/200] [ChangeLog] Actualize --- ChangeLog | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ChangeLog b/ChangeLog index b1425e630..eaf1f7dbd 100644 --- a/ChangeLog +++ b/ChangeLog @@ -3,6 +3,15 @@ version Core * Fix missing subtitles if --add-metadata is used (#12423) +Extractors +* [facebook] Make title optional (#12443) ++ [mitele] Add support for ooyala videos (#12430) +* [openload] Fix extraction (#12435, #12446) +* [streamable] Update API URL (#12433) ++ [crunchyroll] Extract season name (#12428) +* [discoverygo] Bypass geo restriction ++ [discoverygo:playlist] Add support for playlists (#12424) + version 2017.03.10 From 5db83d79bfe192f8a7c80dd44fb2089f114a4189 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 15 Mar 2017 02:01:24 +0700 Subject: [PATCH 008/200] release 2017.03.15 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 76e09c42a..cd8592775 100644 --- 
a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.03.10*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.03.10** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.03.15*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.03.15** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.03.10 +[debug] youtube-dl version 2017.03.15 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index eaf1f7dbd..e10519792 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2017.03.15 Core * Fix missing subtitles if --add-metadata is used (#12423) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 09dc830cb..cc0309f97 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -208,6 +208,7 @@ - **Digiteka** - **Discovery** - 
**DiscoveryGo** + - **DiscoveryGoPlaylist** - **Disney** - **Dotsub** - **DouyuTV**: 斗鱼 diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d74046b37..cbe686517 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.03.10' +__version__ = '2017.03.15' From ba448445b8baa66de92c65793f7ecba8927f0ce8 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 15 Mar 2017 01:40:54 +0100 Subject: [PATCH 009/200] [redbull] improve extraction - extract 1080p quality - correct ttml subtitle ext - catch api errors - reduce request size --- youtube_dl/extractor/redbulltv.py | 62 +++++++++++++++++++------------ 1 file changed, 39 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/redbulltv.py b/youtube_dl/extractor/redbulltv.py index 5c73d5bca..afab62426 100644 --- a/youtube_dl/extractor/redbulltv.py +++ b/youtube_dl/extractor/redbulltv.py @@ -2,11 +2,13 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_HTTPError from ..utils import ( float_or_none, int_or_none, try_get, - unified_timestamp, + # unified_timestamp, + ExtractorError, ) @@ -15,15 +17,15 @@ class RedBullTVIE(InfoExtractor): _TESTS = [{ # film 'url': 'https://www.redbull.tv/video/AP-1Q756YYX51W11/abc-of-wrc', - 'md5': '78e860f631d7a846e712fab8c5fe2c38', + 'md5': 'fb0445b98aa4394e504b413d98031d1f', 'info_dict': { 'id': 'AP-1Q756YYX51W11', 'ext': 'mp4', 'title': 'ABC of...WRC', 'description': 'md5:5c7ed8f4015c8492ecf64b6ab31e7d31', 'duration': 1582.04, - 'timestamp': 1488405786, - 'upload_date': '20170301', + # 'timestamp': 1488405786, + # 'upload_date': '20170301', }, }, { # episode @@ -34,8 +36,8 @@ class RedBullTVIE(InfoExtractor): 'title': 'Grime - Hashtags S2 E4', 'description': 'md5:334b741c8c1ce65be057eab6773c1cf5', 'duration': 904.6, - 'timestamp': 1487290093, - 'upload_date': '20170217', + # 'timestamp': 1487290093, + # 'upload_date': '20170217', 
'series': 'Hashtags', 'season_number': 2, 'episode_number': 4, @@ -48,29 +50,40 @@ class RedBullTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - access_token = self._download_json( - 'https://api-v2.redbull.tv/start', video_id, + session = self._download_json( + 'https://api-v2.redbull.tv/session', video_id, note='Downloading access token', query={ - 'build': '4.0.9', - 'category': 'smartphone', - 'os_version': 23, - 'os_family': 'android', - })['auth']['access_token'] + 'build': '4.370.0', + 'category': 'personal_computer', + 'os_version': '1.0', + 'os_family': 'http', + }) + if session.get('code') == 'error': + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, session['message'])) + auth = '%s %s' % (session.get('token_type', 'Bearer'), session['access_token']) - info = self._download_json( - 'https://api-v2.redbull.tv/views/%s' % video_id, - video_id, note='Downloading video information', - headers={'Authorization': 'Bearer ' + access_token} - )['blocks'][0]['top'][0] + try: + info = self._download_json( + 'https://api-v2.redbull.tv/content/%s' % video_id, + video_id, note='Downloading video information', + headers={'Authorization': auth} + ) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + error_message = self._parse_json( + e.cause.read().decode(), video_id)['message'] + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, error_message), expected=True) + raise video = info['video_product'] title = info['title'].strip() - m3u8_url = video['url'] formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') + video['url'], video_id, 'mp4', 'm3u8_native') + self._sort_formats(formats) subtitles = {} for _, captions in (try_get( @@ -82,9 +95,12 @@ class RedBullTVIE(InfoExtractor): caption_url = caption.get('url') if not caption_url: continue + ext = caption.get('format') + if ext == 'xml': + ext = 'ttml' 
subtitles.setdefault(caption.get('lang') or 'en', []).append({ 'url': caption_url, - 'ext': caption.get('format'), + 'ext': ext, }) subheading = info.get('subheading') @@ -97,7 +113,7 @@ class RedBullTVIE(InfoExtractor): 'description': info.get('long_description') or info.get( 'short_description'), 'duration': float_or_none(video.get('duration'), scale=1000), - 'timestamp': unified_timestamp(info.get('published')), + # 'timestamp': unified_timestamp(info.get('published')), 'series': info.get('show_title'), 'season_number': int_or_none(info.get('season_number')), 'episode_number': int_or_none(info.get('episode_number')), From a3096842856d0471b435fb0a85b295da7c4bcf7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 16 Mar 2017 03:28:01 +0700 Subject: [PATCH 010/200] [extractor/generic] Add forgotten return for jwplayer formats --- youtube_dl/extractor/generic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ad47c3b6b..0fcb3fdac 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2554,6 +2554,7 @@ class GenericIE(InfoExtractor): jwplayer_data, video_id, require_title=False) if not info.get('title'): info['title'] = video_title + return info except ExtractorError: pass From b51dc9db0e6ffc6a7725d92fa2c5de45a5b1be20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 16 Mar 2017 03:30:53 +0700 Subject: [PATCH 011/200] [extractor/common] Extract SMIL formats from jwplayer --- youtube_dl/extractor/common.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 78dc5be24..b51799bfa 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2247,6 +2247,9 @@ class InfoExtractor(object): elif ext == 'mpd': formats.extend(self._extract_mpd_formats( source_url, video_id, mpd_id=mpd_id, fatal=False)) + elif ext == 'smil': + 
formats.extend(self._extract_smil_formats( + source_url, video_id, fatal=False)) # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 elif source_type.startswith('audio') or ext in ( 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'): From 21bfcd3d6e41aed6113c874533fcfe41eb250d96 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 16 Mar 2017 12:50:45 +0100 Subject: [PATCH 012/200] [postprocessor/ffmpeg] Add support for flac Requested at http://stackoverflow.com/q/42828041/35070 --- youtube_dl/__init__.py | 2 +- youtube_dl/postprocessor/ffmpeg.py | 34 +++++++++++++++++++----------- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index c482f9375..2f640607f 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -196,7 +196,7 @@ def _real_main(argv=None): if opts.playlistend not in (-1, None) and opts.playlistend < opts.playliststart: raise ValueError('Playlist end must be greater than playlist start') if opts.extractaudio: - if opts.audioformat not in ['best', 'aac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav']: + if opts.audioformat not in ['best', 'aac', 'flac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav']: parser.error('invalid audio format specified') if opts.audioquality: opts.audioquality = opts.audioquality.strip('k').strip('K') diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 96ddb3b36..7c162d92a 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -26,15 +26,25 @@ from ..utils import ( EXT_TO_OUT_FORMATS = { - "aac": "adts", - "m4a": "ipod", - "mka": "matroska", - "mkv": "matroska", - "mpg": "mpeg", - "ogv": "ogg", - "ts": "mpegts", - "wma": "asf", - "wmv": "asf", + 'aac': 'adts', + 'flac': 'flac', + 'm4a': 'ipod', + 'mka': 'matroska', + 'mkv': 'matroska', + 'mpg': 'mpeg', + 'ogv': 'ogg', + 'ts': 'mpegts', + 'wma': 'asf', + 'wmv': 'asf', +} +ACODECS = { + 'mp3': 'libmp3lame', + 'aac': 
'aac', + 'flac': 'flac', + 'm4a': 'aac', + 'opus': 'opus', + 'vorbis': 'libvorbis', + 'wav': None, } @@ -237,7 +247,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): acodec = 'copy' extension = 'm4a' more_opts = ['-bsf:a', 'aac_adtstoasc'] - elif filecodec in ['aac', 'mp3', 'vorbis', 'opus']: + elif filecodec in ['aac', 'flac', 'mp3', 'vorbis', 'opus']: # Lossless if possible acodec = 'copy' extension = filecodec @@ -256,8 +266,8 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): else: more_opts += ['-b:a', self._preferredquality + 'k'] else: - # We convert the audio (lossy) - acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'opus': 'opus', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec] + # We convert the audio (lossy if codec is lossy) + acodec = ACODECS[self._preferredcodec] extension = self._preferredcodec more_opts = [] if self._preferredquality is not None: From 0efbc6b56d2b030e5dc98fa7f533a2e6cd41cf30 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 16 Mar 2017 12:54:47 +0100 Subject: [PATCH 013/200] [options] Mention flac support and sort alphabetically among the audio formats --- youtube_dl/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 8b51d3c6f..6b811535f 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -773,7 +773,7 @@ def parseOpts(overrideArguments=None): help='Convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe)') postproc.add_option( '--audio-format', metavar='FORMAT', dest='audioformat', default='best', - help='Specify audio format: "best", "aac", "vorbis", "mp3", "m4a", "opus", or "wav"; "%default" by default; No effect without -x') + help='Specify audio format: "best", "aac", "flac", "mp3", "m4a", "opus", "vorbis", or "wav"; "%default" by default; No effect without -x') postproc.add_option( '--audio-quality', metavar='QUALITY', dest='audioquality', default='5', From 
6ad476079db0dd806877cac1b73232d0ae16d50f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 16 Mar 2017 22:39:48 +0700 Subject: [PATCH 014/200] [ChangeLog] Actualize --- ChangeLog | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ChangeLog b/ChangeLog index e10519792..da64f97ea 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +version + +Core ++ [postprocessor/ffmpeg] Add support for flac ++ [extractor/common] Extract SMIL formats from jwplayer + +Extractors ++ [generic] Add forgotten return for jwplayer formats +* [redbulltv] Improve extraction + + version 2017.03.15 Core From 7d539ee10a8b0aeefb408ece19ce543f363006bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 16 Mar 2017 22:42:12 +0700 Subject: [PATCH 015/200] release 2017.03.16 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- README.md | 5 +++-- youtube_dl/version.py | 2 +- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index cd8592775..0e94b6cde 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.03.15*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.03.15** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.03.16*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.03.16** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.03.15 +[debug] youtube-dl version 2017.03.16 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index da64f97ea..75a8bd7a6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2017.03.16 Core + [postprocessor/ffmpeg] Add support for flac diff --git a/README.md b/README.md index 0fc5984dc..86b44781c 100644 --- a/README.md +++ b/README.md @@ -375,8 +375,9 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo (requires ffmpeg or avconv and ffprobe or avprobe) --audio-format FORMAT Specify audio format: "best", "aac", - "vorbis", "mp3", "m4a", "opus", or "wav"; - "best" by default; No effect without -x + "flac", "mp3", "m4a", "opus", "vorbis", or + "wav"; "best" by default; No effect without + -x --audio-quality QUALITY Specify ffmpeg/avconv audio quality, insert a value between 0 (better) and 9 (worse) for VBR or a specific bitrate like 128K diff --git a/youtube_dl/version.py b/youtube_dl/version.py index cbe686517..f38f130bf 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.03.15' +__version__ = '2017.03.16' From 7f3590c43b8ae5cdba3c63e35e786083e3589485 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Mar 2017 00:00:01 +0700 Subject: [PATCH 016/200] [test_InfoExtractor] Add some realworld tests for _extract_jwplayer_data --- test/test_InfoExtractor.py | 93 +++++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 437c7270e..881197afb 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -8,7 +8,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL +from test.helper import FakeYDL, expect_dict from youtube_dl.extractor.common import InfoExtractor from youtube_dl.extractor import YoutubeIE, get_info_extractor from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError @@ -84,6 +84,97 @@ class TestInfoExtractor(unittest.TestCase): self.assertRaises(ExtractorError, self.ie._download_json, uri, None) self.assertEqual(self.ie._download_json(uri, None, fatal=False), None) + def test_extract_jwplayer_data_realworld(self): + # from http://www.suffolk.edu/sjc/ + expect_dict( + self, + self.ie._extract_jwplayer_data(r''' + + ''', None, require_title=False), + { + 'id': 'XEgvuql4', + 'formats': [{ + 'url': 'rtmp://192.138.214.154/live/sjclive', + 'ext': 'flv' + }] + }) + + # from https://www.pornoxo.com/videos/7564/striptease-from-sexy-secretary/ + expect_dict( + self, + self.ie._extract_jwplayer_data(r''' + + ''', 'dummy', require_title=False), + { + 'thumbnail': 'https://t03.vipstreamservice.com/thumbs/pxo-full/2009-12/14/a4b2157147afe5efa93ce1978e0265289c193874e02597.flv-full-13.jpg', + 'formats': [{ + 'url': 'https://cdn.pornoxo.com/key=MF+oEbaxqTKb50P-w9G3nA,end=1489689259,ip=104.199.146.27/ip=104.199.146.27/speed=6573765/buffer=3.0/2009-12/4b2157147afe5efa93ce1978e0265289c193874e02597.flv', + 'ext': 'flv' + }] + }) + + # from http://www.indiedb.com/games/king-machine/videos + 
expect_dict( + self, + self.ie._extract_jwplayer_data(r''' + + ''', 'dummy'), + { + 'title': 'king machine trailer 1', + 'thumbnail': 'http://media.indiedb.com/cache/images/games/1/50/49678/thumb_620x2000/king-machine-trailer.mp4.jpg', + 'formats': [{ + 'url': 'http://cdn.dbolical.com/cache/videos/games/1/50/49678/encode_mp4/king-machine-trailer.mp4', + 'height': 360, + 'ext': 'mp4' + }, { + 'url': 'http://cdn.dbolical.com/cache/videos/games/1/50/49678/encode720p_mp4/king-machine-trailer.mp4', + 'height': 720, + 'ext': 'mp4' + }] + }) + if __name__ == '__main__': unittest.main() From ea883a687c054692fcfe3cea15a22269044b64bb Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 17 Mar 2017 15:20:12 +0800 Subject: [PATCH 017/200] [openload] Fix extraction (closes #10408) Thanks to @makgun02 Ref: http://pastebin.com/raw/JX9gHFUz --- ChangeLog | 6 +++++ youtube_dl/extractor/openload.py | 43 ++++++++++++++++++++------------ 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/ChangeLog b/ChangeLog index 75a8bd7a6..eeb5813c5 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version + +Extractors +* [openload] Fix extraction (#10408) + + version 2017.03.16 Core diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 5ea749f35..fa876b127 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -78,34 +78,45 @@ class OpenloadIE(InfoExtractor): video_url_chars = [] first_char = ord(ol_id[0]) - key = first_char - 50 + key = first_char - 55 maxKey = max(2, key) - key = min(maxKey, len(ol_id) - 22) - t = ol_id[key:key + 20] + key = min(maxKey, len(ol_id) - 26) + t = ol_id[key:key + 24] hashMap = {} - v = ol_id.replace(t, "") + v = ol_id.replace(t, '') h = 0 while h < len(t): - f = t[h:h + 2] - i = int(f, 16) - hashMap[h / 2] = i - h += 2 + f = t[h:h + 3] + i = int(f, 8) + hashMap[h / 3] = i + h += 3 h = 0 - + H = 0 while h < len(v): - B = v[h:h + 3] + B = '' + C = '' + if len(v) >= h + 2: + B = 
v[h:h + 2] + if len(v) >= h + 3: + C = v[h:h + 3] i = int(B, 16) - if (h / 3) % 3 == 0: - i = int(B, 8) - index = (h / 3) % 10 + h += 2 + if H % 3 == 0: + i = int(C, 8) + h += 1 + elif H % 2 == 0 and H != 0 and ord(v[H - 1]) < 60: + i = int(C, 10) + h += 1 + index = H % 8 + A = hashMap[index] - i = i ^ 47 - i = i ^ A + i ^= 213 + i ^= A video_url_chars.append(compat_chr(i)) - h += 3 + H += 1 video_url = 'https://openload.co/stream/%s?mime=true' video_url = video_url % (''.join(video_url_chars)) From 3e5856d860bb94b4dbe2fa38d9c50a6a92bb7401 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 17 Mar 2017 09:53:44 +0100 Subject: [PATCH 018/200] [discoverynetworks] add support for more domains and bypass geo restiction --- .../{tlc.py => discoverynetworks.py} | 23 +++++++++++++------ youtube_dl/extractor/extractors.py | 2 +- 2 files changed, 17 insertions(+), 8 deletions(-) rename youtube_dl/extractor/{tlc.py => discoverynetworks.py} (64%) diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/discoverynetworks.py similarity index 64% rename from youtube_dl/extractor/tlc.py rename to youtube_dl/extractor/discoverynetworks.py index fd145ba42..b6653784c 100644 --- a/youtube_dl/extractor/tlc.py +++ b/youtube_dl/extractor/discoverynetworks.py @@ -9,13 +9,13 @@ from ..compat import ( compat_parse_qs, compat_urlparse, ) +from ..utils import smuggle_url -class TlcDeIE(InfoExtractor): - IE_NAME = 'tlc.de' - _VALID_URL = r'https?://(?:www\.)?tlc\.de/(?:[^/]+/)*videos/(?P[^/?#]+)?(?:.*#(?P<id>\d+))?' 
+class DiscoveryNetworksDeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:discovery|tlc|animalplanet|dmax)\.de/(?:.*#(?P<id>\d+)|(?:[^/]+/)*videos/(?P<title>[^/?#]+))' - _TEST = { + _TESTS = [{ 'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001', 'info_dict': { 'id': '3235167922001', @@ -29,7 +29,13 @@ class TlcDeIE(InfoExtractor): 'upload_date': '20140404', 'uploader_id': '1659832546', }, - } + }, { + 'url': 'http://www.dmax.de/programme/storage-hunters-uk/videos/storage-hunters-uk-episode-6/', + 'only_matching': True, + }, { + 'url': 'http://www.discovery.de/#5332316765001', + 'only_matching': True, + }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1659832546/default_default/index.html?videoId=%s' def _real_extract(self, url): @@ -39,5 +45,8 @@ class TlcDeIE(InfoExtractor): title = mobj.group('title') webpage = self._download_webpage(url, title) brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) - brightcove_id = compat_parse_qs(compat_urlparse.urlparse(brightcove_legacy_url).query)['@videoPlayer'][0] - return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) + brightcove_id = compat_parse_qs(compat_urlparse.urlparse( + brightcove_legacy_url).query)['@videoPlayer'][0] + return self.url_result(smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, {'geo_countries': ['DE']}), + 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 24c478932..79405b468 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -269,6 +269,7 @@ from .discoverygo import ( DiscoveryGoIE, DiscoveryGoPlaylistIE, ) +from .discoverynetworks import DiscoveryNetworksDeIE from .disney import DisneyIE from .dispeak import DigitallySpeakingIE from .dropbox import DropboxIE @@ -973,7 +974,6 @@ from .thisav import ThisAVIE from .thisoldhouse import ThisOldHouseIE from .threeqsdn 
import ThreeQSDNIE from .tinypic import TinyPicIE -from .tlc import TlcDeIE from .tmz import ( TMZIE, TMZArticleIE, From e7a51a4c0235fafefc672d753017c770a306677a Mon Sep 17 00:00:00 2001 From: mrBliss <dewinant@gmail.com> Date: Tue, 31 Jan 2017 13:59:18 +0100 Subject: [PATCH 019/200] [vtm] Add extractor (closes #9974) Implementation of the approach described in #9974. --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/vtm.py | 136 +++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+) create mode 100644 youtube_dl/extractor/vtm.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 79405b468..64316d4a8 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1174,6 +1174,7 @@ from .voxmedia import VoxMediaIE from .vporn import VpornIE from .vrt import VRTIE from .vrak import VrakIE +from .vtm import VTMIE from .vube import VubeIE from .vuclip import VuClipIE from .vvvvid import VVVVIDIE diff --git a/youtube_dl/extractor/vtm.py b/youtube_dl/extractor/vtm.py new file mode 100644 index 000000000..f0a70040b --- /dev/null +++ b/youtube_dl/extractor/vtm.py @@ -0,0 +1,136 @@ +from __future__ import unicode_literals + +import re + +from .generic import GenericIE +from .common import InfoExtractor +from ..utils import ( + urlencode_postdata, + compat_urllib_parse_urlencode, + ExtractorError, + remove_end, +) + + +class VTMIE(InfoExtractor): + """Download full episodes that require an account from vtm.be or q2.be. + + The generic extractor can be used to download clips that do no require an + account. + """ + _VALID_URL = r'https?://(?:www\.)?(?P<site_id>vtm|q2)\.be/video[/?].+?' 
+ _NETRC_MACHINE = 'vtm' + _APIKEY = '3_HZ0FtkMW_gOyKlqQzW5_0FHRC7Nd5XpXJZcDdXY4pk5eES2ZWmejRW5egwVm4ug-' + _TESTS = [ + { + 'url': 'http://vtm.be/video/volledige-afleveringen/id/vtm_20170219_VM0678361_vtmwatch', + 'info_dict': { + 'id': 'vtm_20170219_VM0678361_vtmwatch', + 'ext': 'mp4', + 'title': 'Allemaal Chris afl. 6', + 'description': 'md5:4be86427521e7b07e0adb0c9c554ddb2', + }, + 'skip_download': True, + }, + { + 'url': 'http://vtm.be/video/volledige-afleveringen/id/257107153551000', + 'only_matching': True, + }, + { + 'url': 'http://vtm.be/video?aid=163157', + 'only_matching': True, + }, + { + 'url': 'http://www.q2.be/video/volledige-afleveringen/id/2be_20170301_VM0684442_q2', + 'only_matching': True, + }, + { + 'url': 'http://vtm.be/video?aid=168332', + 'info_dict': { + 'id': 'video?aid=168332', + 'ext': 'mp4', + 'title': 'Videozone', + }, + }, + ] + + def _real_initialize(self): + self._logged_in = False + + def _login(self): + (username, password) = self._get_login_info() + if username is None or password is None: + self.raise_login_required() + + auth_data = { + 'APIKey': self._APIKEY, + 'sdk': 'js_6.1', + 'format': 'json', + 'loginID': username, + 'password': password, + } + + auth_info = self._download_json( + 'https://accounts.eu1.gigya.com/accounts.login', None, + note='Logging in', errnote='Unable to log in', + data=urlencode_postdata(auth_data), fatal=True) + + error_message = auth_info.get('errorDetails') + if error_message: + raise ExtractorError( + 'Unable to login: %s' % error_message, expected=True) + + self._uid = auth_info['UID'] + self._uid_signature = auth_info['UIDSignature'] + self._signature_timestamp = auth_info['signatureTimestamp'] + + self._logged_in = True + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + site_id = mobj.group('site_id') + + webpage = self._download_webpage(url, None, "Downloading webpage") + + # The URL sometimes contains the video id, but not always, e.g., test + # case 3. 
Fortunately, all webpages of videos requiring authentication + # contain the video id. + video_id = self._search_regex( + r'\\"vodId\\":\\"(.+?)\\"', webpage, 'video_id', default=None) + + # It was most likely a video not requiring authentication. + if not video_id: + return self.url_result(url, 'Generic') + + if not self._logged_in: + self._login() + + title = self._html_search_regex( + r'\\"title\\":\\"(.+?)\\"', webpage, 'title', default=None) + + description = self._html_search_regex( + r'<div[^>]+class="field-item\s+even">\s*<p>(.+?)</p>', + webpage, 'description', default=None) + + data_url = 'http://vod.medialaan.io/api/1.0/item/%s/video' % video_id + m3u8_data = { + 'app_id': 'vtm_watch' if site_id == 'vtm' else 'q2', + 'user_network': 'vtm-sso', + 'UID': self._uid, + 'UIDSignature': self._uid_signature, + 'signatureTimestamp': self._signature_timestamp, + } + data = self._download_json(data_url, video_id, query=m3u8_data) + + formats = self._extract_m3u8_formats( + data['response']['uri'], video_id, entry_protocol='m3u8_native', + ext='mp4', m3u8_id='hls') + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + } From 2a721cdff2da0a9267c96ff2f4c19cda4ce0ab83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 18 Mar 2017 05:58:54 +0700 Subject: [PATCH 020/200] [medialaan] Fix and improve extraction (closes #11912) --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/medialaan.py | 263 +++++++++++++++++++++++++++++ youtube_dl/extractor/vtm.py | 136 --------------- 3 files changed, 264 insertions(+), 137 deletions(-) create mode 100644 youtube_dl/extractor/medialaan.py delete mode 100644 youtube_dl/extractor/vtm.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 64316d4a8..6b4742ed8 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1174,7 
+1174,7 @@ from .voxmedia import VoxMediaIE from .vporn import VpornIE from .vrt import VRTIE from .vrak import VrakIE -from .vtm import VTMIE +from .medialaan import MedialaanIE from .vube import VubeIE from .vuclip import VuClipIE from .vvvvid import VVVVIDIE diff --git a/youtube_dl/extractor/medialaan.py b/youtube_dl/extractor/medialaan.py new file mode 100644 index 000000000..e70d4679d --- /dev/null +++ b/youtube_dl/extractor/medialaan.py @@ -0,0 +1,263 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + parse_duration, + try_get, + unified_timestamp, + urlencode_postdata, +) + + +class MedialaanIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + (?: + (?P<site_id>vtm|q2|vtmkzoom)\.be/ + (?: + video(?:/[^/]+/id/|/?\?.*?\baid=)| + (?:[^/]+/)* + ) + ) + (?P<id>[^/?#&]+) + ''' + _NETRC_MACHINE = 'medialaan' + _APIKEY = '3_HZ0FtkMW_gOyKlqQzW5_0FHRC7Nd5XpXJZcDdXY4pk5eES2ZWmejRW5egwVm4ug-' + _SITE_TO_APP_ID = { + 'vtm': 'vtm_watch', + 'q2': 'q2', + 'vtmkzoom': 'vtmkzoom', + } + _TESTS = [{ + # vod + 'url': 'http://vtm.be/video/volledige-afleveringen/id/vtm_20170219_VM0678361_vtmwatch', + 'info_dict': { + 'id': 'vtm_20170219_VM0678361_vtmwatch', + 'ext': 'mp4', + 'title': 'Allemaal Chris afl. 6', + 'description': 'md5:4be86427521e7b07e0adb0c9c554ddb2', + 'timestamp': 1487533280, + 'upload_date': '20170219', + 'duration': 2562, + 'series': 'Allemaal Chris', + 'season': 'Allemaal Chris', + 'season_number': 1, + 'season_id': '256936078124527', + 'episode': 'Allemaal Chris afl. 
6', + 'episode_number': 6, + 'episode_id': '256936078591527', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires account credentials', + }, { + # clip + 'url': 'http://vtm.be/video?aid=168332', + 'info_dict': { + 'id': '168332', + 'ext': 'mp4', + 'title': '"Veronique liegt!"', + 'description': 'md5:1385e2b743923afe54ba4adc38476155', + 'timestamp': 1489002029, + 'upload_date': '20170308', + 'duration': 96, + }, + }, { + # vod + 'url': 'http://vtm.be/video/volledige-afleveringen/id/257107153551000', + 'only_matching': True, + }, { + # vod + 'url': 'http://vtm.be/video?aid=163157', + 'only_matching': True, + }, { + # vod + 'url': 'http://www.q2.be/video/volledige-afleveringen/id/2be_20170301_VM0684442_q2', + 'only_matching': True, + }, { + # clip + 'url': 'http://vitaya.be/de-jurk/precies-je-hebt-geen-borsten', + 'only_matching': True, + }, { + # clip + 'url': 'http://vtmkzoom.be/k3-dansstudio/een-nieuw-seizoen-van-k3-dansstudio', + 'only_matching': True, + }] + + def _real_initialize(self): + self._logged_in = False + + def _login(self): + username, password = self._get_login_info() + if username is None: + self.raise_login_required() + + auth_data = { + 'APIKey': self._APIKEY, + 'sdk': 'js_6.1', + 'format': 'json', + 'loginID': username, + 'password': password, + } + + auth_info = self._download_json( + 'https://accounts.eu1.gigya.com/accounts.login', None, + note='Logging in', errnote='Unable to log in', + data=urlencode_postdata(auth_data)) + + error_message = auth_info.get('errorDetails') or auth_info.get('errorMessage') + if error_message: + raise ExtractorError( + 'Unable to login: %s' % error_message, expected=True) + + self._uid = auth_info['UID'] + self._uid_signature = auth_info['UIDSignature'] + self._signature_timestamp = auth_info['signatureTimestamp'] + + self._logged_in = True + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id, site_id = mobj.group('id', 'site_id') + + webpage = 
self._download_webpage(url, video_id) + + config = self._parse_json( + self._search_regex( + r'videoJSConfig\s*=\s*JSON\.parse\(\'({.+?})\'\);', + webpage, 'config', default='{}'), video_id, + transform_source=lambda s: s.replace( + '\\\\', '\\').replace(r'\"', '"').replace(r"\'", "'")) + + vod_id = config.get('vodId') or self._search_regex( + (r'\\"vodId\\"\s*:\s*\\"(.+?)\\"', + r'<[^>]+id=["\']vod-(\d+)'), + webpage, 'video_id', default=None) + + # clip, no authentication required + if not vod_id: + player = self._parse_json( + self._search_regex( + r'vmmaplayer\(({.+?})\);', webpage, 'vmma player', + default=''), + video_id, transform_source=lambda s: '[%s]' % s, fatal=False) + if player: + video = player[-1] + info = { + 'id': video_id, + 'url': video['videoUrl'], + 'title': video['title'], + 'thumbnail': video.get('imageUrl'), + 'timestamp': int_or_none(video.get('createdDate')), + 'duration': int_or_none(video.get('duration')), + } + else: + info = self._parse_html5_media_entries( + url, webpage, video_id, m3u8_id='hls')[0] + info.update({ + 'id': video_id, + 'title': self._html_search_meta('description', webpage), + 'duration': parse_duration(self._html_search_meta('duration', webpage)), + }) + # vod, authentication required + else: + if not self._logged_in: + self._login() + + settings = self._parse_json( + self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', + webpage, 'drupal settings', default='{}'), + video_id) + + def get(container, item): + return try_get( + settings, lambda x: x[container][item], + compat_str) or self._search_regex( + r'"%s"\s*:\s*"([^"]+)' % item, webpage, item, + default=None) + + app_id = get('vod', 'app_id') or self._SITE_TO_APP_ID.get(site_id, 'vtm_watch') + sso = get('vod', 'gigyaDatabase') or 'vtm-sso' + + data = self._download_json( + 'http://vod.medialaan.io/api/1.0/item/%s/video' % vod_id, + video_id, query={ + 'app_id': app_id, + 'user_network': sso, + 'UID': self._uid, + 'UIDSignature': 
self._uid_signature, + 'signatureTimestamp': self._signature_timestamp, + }) + + formats = self._extract_m3u8_formats( + data['response']['uri'], video_id, entry_protocol='m3u8_native', + ext='mp4', m3u8_id='hls') + + self._sort_formats(formats) + + info = { + 'id': vod_id, + 'formats': formats, + } + + api_key = get('vod', 'apiKey') + channel = get('medialaanGigya', 'channel') + + if api_key: + videos = self._download_json( + 'http://vod.medialaan.io/vod/v2/videos', video_id, fatal=False, + query={ + 'channels': channel, + 'ids': vod_id, + 'limit': 1, + 'apikey': api_key, + }) + if videos: + video = try_get( + videos, lambda x: x['response']['videos'][0], dict) + if video: + def get(container, item, expected_type=None): + return try_get( + video, lambda x: x[container][item], expected_type) + + def get_string(container, item): + return get(container, item, compat_str) + + info.update({ + 'series': get_string('program', 'title'), + 'season': get_string('season', 'title'), + 'season_number': int_or_none(get('season', 'number')), + 'season_id': get_string('season', 'id'), + 'episode': get_string('episode', 'title'), + 'episode_number': int_or_none(get('episode', 'number')), + 'episode_id': get_string('episode', 'id'), + 'duration': int_or_none( + video.get('duration')) or int_or_none( + video.get('durationMillis'), scale=1000), + 'title': get_string('episode', 'title'), + 'description': get_string('episode', 'text'), + 'timestamp': unified_timestamp(get_string( + 'publication', 'begin')), + }) + + if not info.get('title'): + info['title'] = try_get( + config, lambda x: x['videoConfig']['title'], + compat_str) or self._html_search_regex( + r'\\"title\\"\s*:\s*\\"(.+?)\\"', webpage, 'title', + default=None) or self._og_search_title(webpage) + + if not info.get('description'): + info['description'] = self._html_search_regex( + r'<div[^>]+class="field-item\s+even">\s*<p>(.+?)</p>', + webpage, 'description', default=None) + + return info diff --git 
a/youtube_dl/extractor/vtm.py b/youtube_dl/extractor/vtm.py deleted file mode 100644 index f0a70040b..000000000 --- a/youtube_dl/extractor/vtm.py +++ /dev/null @@ -1,136 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .generic import GenericIE -from .common import InfoExtractor -from ..utils import ( - urlencode_postdata, - compat_urllib_parse_urlencode, - ExtractorError, - remove_end, -) - - -class VTMIE(InfoExtractor): - """Download full episodes that require an account from vtm.be or q2.be. - - The generic extractor can be used to download clips that do no require an - account. - """ - _VALID_URL = r'https?://(?:www\.)?(?P<site_id>vtm|q2)\.be/video[/?].+?' - _NETRC_MACHINE = 'vtm' - _APIKEY = '3_HZ0FtkMW_gOyKlqQzW5_0FHRC7Nd5XpXJZcDdXY4pk5eES2ZWmejRW5egwVm4ug-' - _TESTS = [ - { - 'url': 'http://vtm.be/video/volledige-afleveringen/id/vtm_20170219_VM0678361_vtmwatch', - 'info_dict': { - 'id': 'vtm_20170219_VM0678361_vtmwatch', - 'ext': 'mp4', - 'title': 'Allemaal Chris afl. 
6', - 'description': 'md5:4be86427521e7b07e0adb0c9c554ddb2', - }, - 'skip_download': True, - }, - { - 'url': 'http://vtm.be/video/volledige-afleveringen/id/257107153551000', - 'only_matching': True, - }, - { - 'url': 'http://vtm.be/video?aid=163157', - 'only_matching': True, - }, - { - 'url': 'http://www.q2.be/video/volledige-afleveringen/id/2be_20170301_VM0684442_q2', - 'only_matching': True, - }, - { - 'url': 'http://vtm.be/video?aid=168332', - 'info_dict': { - 'id': 'video?aid=168332', - 'ext': 'mp4', - 'title': 'Videozone', - }, - }, - ] - - def _real_initialize(self): - self._logged_in = False - - def _login(self): - (username, password) = self._get_login_info() - if username is None or password is None: - self.raise_login_required() - - auth_data = { - 'APIKey': self._APIKEY, - 'sdk': 'js_6.1', - 'format': 'json', - 'loginID': username, - 'password': password, - } - - auth_info = self._download_json( - 'https://accounts.eu1.gigya.com/accounts.login', None, - note='Logging in', errnote='Unable to log in', - data=urlencode_postdata(auth_data), fatal=True) - - error_message = auth_info.get('errorDetails') - if error_message: - raise ExtractorError( - 'Unable to login: %s' % error_message, expected=True) - - self._uid = auth_info['UID'] - self._uid_signature = auth_info['UIDSignature'] - self._signature_timestamp = auth_info['signatureTimestamp'] - - self._logged_in = True - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - site_id = mobj.group('site_id') - - webpage = self._download_webpage(url, None, "Downloading webpage") - - # The URL sometimes contains the video id, but not always, e.g., test - # case 3. Fortunately, all webpages of videos requiring authentication - # contain the video id. - video_id = self._search_regex( - r'\\"vodId\\":\\"(.+?)\\"', webpage, 'video_id', default=None) - - # It was most likely a video not requiring authentication. 
- if not video_id: - return self.url_result(url, 'Generic') - - if not self._logged_in: - self._login() - - title = self._html_search_regex( - r'\\"title\\":\\"(.+?)\\"', webpage, 'title', default=None) - - description = self._html_search_regex( - r'<div[^>]+class="field-item\s+even">\s*<p>(.+?)</p>', - webpage, 'description', default=None) - - data_url = 'http://vod.medialaan.io/api/1.0/item/%s/video' % video_id - m3u8_data = { - 'app_id': 'vtm_watch' if site_id == 'vtm' else 'q2', - 'user_network': 'vtm-sso', - 'UID': self._uid, - 'UIDSignature': self._uid_signature, - 'signatureTimestamp': self._signature_timestamp, - } - data = self._download_json(data_url, video_id, query=m3u8_data) - - formats = self._extract_m3u8_formats( - data['response']['uri'], video_id, entry_protocol='m3u8_native', - ext='mp4', m3u8_id='hls') - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'formats': formats, - } From 5f0daab1ca60803f4f49b344ddb3757c418a2d8e Mon Sep 17 00:00:00 2001 From: Vijay Singh <sudovijay@users.noreply.github.com> Date: Sat, 18 Mar 2017 04:32:55 +0530 Subject: [PATCH 021/200] [openload] Fix extraction --- youtube_dl/extractor/openload.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index fa876b127..435aec28e 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -80,8 +80,8 @@ class OpenloadIE(InfoExtractor): first_char = ord(ol_id[0]) key = first_char - 55 maxKey = max(2, key) - key = min(maxKey, len(ol_id) - 26) - t = ol_id[key:key + 24] + key = min(maxKey, len(ol_id) - 38) + t = ol_id[key:key + 36] hashMap = {} v = ol_id.replace(t, '') @@ -110,7 +110,7 @@ class OpenloadIE(InfoExtractor): elif H % 2 == 0 and H != 0 and ord(v[H - 1]) < 60: i = int(C, 10) h += 1 - index = H % 8 + index = H % 12 A = hashMap[index] i ^= 213 From febfe1e2626bab5dbb8d4e0bbe31aa225ce09d35 Mon 
Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 18 Mar 2017 06:19:11 +0700 Subject: [PATCH 022/200] [adobepass] Detect and output error on authz token extraction (#12472) --- youtube_dl/extractor/adobepass.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py index d4816abf5..1b2d364ca 100644 --- a/youtube_dl/extractor/adobepass.py +++ b/youtube_dl/extractor/adobepass.py @@ -1458,6 +1458,8 @@ class AdobePassIE(InfoExtractor): self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {}) count += 1 continue + if '<error' in authorize: + raise ExtractorError(xml_text(authorize, 'details'), expected=True) authz_token = unescapeHTML(xml_text(authorize, 'authzToken')) requestor_info[guid] = authz_token self._downloader.cache.store(self._MVPD_CACHE, requestor_id, requestor_info) From f68ef1e2abd876ffca65544fd3e42756f9c33be3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 18 Mar 2017 23:23:47 +0700 Subject: [PATCH 023/200] [medialaan] Remove unrelated test --- youtube_dl/extractor/medialaan.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/youtube_dl/extractor/medialaan.py b/youtube_dl/extractor/medialaan.py index e70d4679d..6e067474b 100644 --- a/youtube_dl/extractor/medialaan.py +++ b/youtube_dl/extractor/medialaan.py @@ -81,10 +81,6 @@ class MedialaanIE(InfoExtractor): # vod 'url': 'http://www.q2.be/video/volledige-afleveringen/id/2be_20170301_VM0684442_q2', 'only_matching': True, - }, { - # clip - 'url': 'http://vitaya.be/de-jurk/precies-je-hebt-geen-borsten', - 'only_matching': True, }, { # clip 'url': 'http://vtmkzoom.be/k3-dansstudio/een-nieuw-seizoen-van-k3-dansstudio', From 772b5ff57f702dd76986d1db17068da2116a2800 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 19 Mar 2017 00:45:04 +0100 Subject: [PATCH 024/200] [toongoggles] Add new extractor(closes #12171) --- 
youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/toongoggles.py | 81 +++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 youtube_dl/extractor/toongoggles.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6b4742ed8..97d68d9ca 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -986,6 +986,7 @@ from .tnaflix import ( ) from .toggle import ToggleIE from .tonline import TOnlineIE +from .toongoggles import ToonGogglesIE from .toutv import TouTvIE from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE diff --git a/youtube_dl/extractor/toongoggles.py b/youtube_dl/extractor/toongoggles.py new file mode 100644 index 000000000..b5ba1c01d --- /dev/null +++ b/youtube_dl/extractor/toongoggles.py @@ -0,0 +1,81 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, +) + + +class ToonGogglesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?toongoggles\.com/shows/(?P<show_id>\d+)(?:/[^/]+/episodes/(?P<episode_id>\d+))?' 
+ _TESTS = [{ + 'url': 'http://www.toongoggles.com/shows/217143/bernard-season-2/episodes/217147/football', + 'md5': '18289fc2b951eff6b953a9d8f01e6831', + 'info_dict': { + 'id': '217147', + 'ext': 'mp4', + 'title': 'Football', + 'uploader_id': '1', + 'description': 'Bernard decides to play football in order to be better than Lloyd and tries to beat him no matter how, he even cheats.', + 'upload_date': '20160718', + 'timestamp': 1468879330, + } + }, { + 'url': 'http://www.toongoggles.com/shows/227759/om-nom-stories-around-the-world', + 'info_dict': { + 'id': '227759', + 'title': 'Om Nom Stories Around The World', + }, + 'playlist_mincount': 11, + }] + + def _call_api(self, action, page_id, query): + query.update({ + 'for_ng': 1, + 'for_web': 1, + 'show_meta': 1, + 'version': 7.0, + }) + return self._download_json('http://api.toongoggles.com/' + action, page_id, query=query) + + def _parse_episode_data(self, episode_data): + title = episode_data['episode_name'] + + return { + '_type': 'url_transparent', + 'id': episode_data['episode_id'], + 'title': title, + 'url': 'kaltura:513551:' + episode_data['entry_id'], + 'thumbnail': episode_data.get('thumbnail_url'), + 'description': episode_data.get('description'), + 'duration': parse_duration(episode_data.get('hms')), + 'series': episode_data.get('show_name'), + 'season_number': int_or_none(episode_data.get('season_num')), + 'episode_id': episode_data.get('episode_id'), + 'episode': title, + 'episode_number': int_or_none(episode_data.get('episode_num')), + 'categories': episode_data.get('categories'), + 'ie_key': 'Kaltura', + } + + def _real_extract(self, url): + show_id, episode_id = re.match(self._VALID_URL, url).groups() + if episode_id: + episode_data = self._call_api('search', episode_id, { + 'filter': 'episode', + 'id': episode_id, + })['objects'][0] + return self._parse_episode_data(episode_data) + else: + show_data = self._call_api('getepisodesbyshow', show_id, { + 'max': 1000000000, + 'showid': show_id, + }) + 
entries = [] + for episode_data in show_data.get('objects', []): + entries.append(self._parse_episode_data(episode_data)) + return self.playlist_result(entries, show_id, show_data.get('show_name')) From 46b18f2349670d395b9d84a57ee3d9b5d221ff4b Mon Sep 17 00:00:00 2001 From: John Hawkinson <jhawk@mit.edu> Date: Wed, 8 Mar 2017 18:13:54 -0500 Subject: [PATCH 025/200] [BostonGlobe] New. Nonstandard version of Brightcove. Has a "data-brightcove-video-id" instead of a "data-video-id," otherwise pretty much just Brightcove. Except the Globe isn't all Brightcove videos, so fallback to Generic, too. Also, abstract playlist_from_matches() from generic.py to common.py, and use it here. History of these changes can be found in 51170427d4b1143572a498dedaee61863a5b2c5b. --- youtube_dl/extractor/bostonglobe.py | 72 +++++++++++++++++++++++++++ youtube_dl/extractor/common.py | 28 +++++++---- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 77 +++++++++++++---------------- 4 files changed, 126 insertions(+), 52 deletions(-) create mode 100644 youtube_dl/extractor/bostonglobe.py diff --git a/youtube_dl/extractor/bostonglobe.py b/youtube_dl/extractor/bostonglobe.py new file mode 100644 index 000000000..57882fbee --- /dev/null +++ b/youtube_dl/extractor/bostonglobe.py @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +from ..utils import ( + extract_attributes, +) + + +class BostonGlobeIE(InfoExtractor): + _VALID_URL = r'(?i)https?://(?:www\.)?bostonglobe\.com/.*/(?P<id>[^/]+)/\w+(?:\.html)?' 
+ _TESTS = [ + { + 'url': 'http://www.bostonglobe.com/metro/2017/02/11/tree-finally-succumbs-disease-leaving-hole-neighborhood/h1b4lviqzMTIn9sVy8F3gP/story.html', + 'md5': '0a62181079c85c2d2b618c9a738aedaf', + 'info_dict': { + 'title': 'A tree finally succumbs to disease, leaving a hole in a neighborhood', + 'id': '5320421710001', + 'ext': 'mp4', + 'description': 'It arrived as a sapling when the Back Bay was in its infancy, a spindly American elm tamped down into a square of dirt cut into the brick sidewalk of 1880s Marlborough Street, no higher than the first bay window of the new brownstone behind it.', + 'timestamp': 1486877593, + 'upload_date': '20170212', + 'uploader_id': '245991542', + }, + }, + { + # Embedded youtube video; we hand it off to the Generic extractor. + 'url': 'https://www.bostonglobe.com/lifestyle/names/2017/02/17/does-ben-affleck-play-matt-damon-favorite-version-batman/ruqkc9VxKBYmh5txn1XhSI/story.html', + 'md5': '582b40327089d5c0c949b3c54b13c24b', + 'info_dict': { + 'title': "Who Is Matt Damon's Favorite Batman?", + 'id': 'ZW1QCnlA6Qc', + 'ext': 'mp4', + 'upload_date': '20170217', + 'description': 'md5:3b3dccb9375867e0b4d527ed87d307cb', + 'uploader': 'The Late Late Show with James Corden', + 'uploader_id': 'TheLateLateShow', + }, + 'expected_warnings': ['404'], + }, + ] + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + + page_title = self._og_search_title(webpage, default=None) + + # <video data-brightcove-video-id="5320421710001" data-account="245991542" data-player="SJWAiyYWg" data-embed="default" class="video-js" controls itemscope itemtype="http://schema.org/VideoObject"> + entries = [] + for video in re.findall(r'(?i)(<video[^>]+>)', webpage): + attrs = extract_attributes(video) + + video_id = attrs.get('data-brightcove-video-id') + account_id = attrs.get('data-account') + player_id = attrs.get('data-player') + embed = attrs.get('data-embed') + + if video_id and 
account_id and player_id and embed: + entries.append( + 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' + % (account_id, player_id, embed, video_id)) + + if len(entries) == 0: + return self.url_result(url, 'Generic') + elif len(entries) == 1: + return self.url_result(entries[0], 'BrightcoveNew') + else: + return self.playlist_from_matches(entries, page_id, page_title, ie='BrightcoveNew') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index b51799bfa..0852b8e8c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -36,34 +36,35 @@ from ..utils import ( clean_html, compiled_regex_type, determine_ext, + determine_protocol, error_to_compat_str, ExtractorError, + extract_attributes, fix_xml_ampersands, float_or_none, GeoRestrictedError, GeoUtils, int_or_none, js_to_json, + mimetype2ext, + orderedSet, + parse_codecs, + parse_duration, parse_iso8601, + parse_m3u8_attributes, RegexNotFoundError, - sanitize_filename, sanitized_Request, + sanitize_filename, unescapeHTML, unified_strdate, unified_timestamp, + update_Request, + update_url_query, + urljoin, url_basename, xpath_element, xpath_text, xpath_with_ns, - determine_protocol, - parse_duration, - mimetype2ext, - update_Request, - update_url_query, - parse_m3u8_attributes, - extract_attributes, - parse_codecs, - urljoin, ) @@ -714,6 +715,13 @@ class InfoExtractor(object): video_info['title'] = video_title return video_info + def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None): + urlrs = orderedSet( + self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) + for m in matches) + return self.playlist_result( + urlrs, playlist_id=video_id, playlist_title=video_title) + @staticmethod def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None): """Returns a playlist""" diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 
97d68d9ca..40a5c9842 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -117,6 +117,7 @@ from .bleacherreport import ( from .blinkx import BlinkxIE from .bloomberg import BloombergIE from .bokecc import BokeCCIE +from .bostonglobe import BostonGlobeIE from .bpb import BpbIE from .br import BRIE from .bravotv import BravoTVIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 0fcb3fdac..a71d6bac0 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1841,14 +1841,6 @@ class GenericIE(InfoExtractor): video_description = self._og_search_description(webpage, default=None) video_thumbnail = self._og_search_thumbnail(webpage, default=None) - # Helper method - def _playlist_from_matches(matches, getter=None, ie=None): - urlrs = orderedSet( - self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) - for m in matches) - return self.playlist_result( - urlrs, playlist_id=video_id, playlist_title=video_title) - # Look for Brightcove Legacy Studio embeds bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) if bc_urls: @@ -1869,28 +1861,28 @@ class GenericIE(InfoExtractor): # Look for Brightcove New Studio embeds bc_urls = BrightcoveNewIE._extract_urls(webpage) if bc_urls: - return _playlist_from_matches(bc_urls, ie='BrightcoveNew') + return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew') # Look for ThePlatform embeds tp_urls = ThePlatformIE._extract_urls(webpage) if tp_urls: - return _playlist_from_matches(tp_urls, ie='ThePlatform') + return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform') # Look for Vessel embeds vessel_urls = VesselIE._extract_urls(webpage) if vessel_urls: - return _playlist_from_matches(vessel_urls, ie=VesselIE.ie_key()) + return self.playlist_from_matches(vessel_urls, video_id, video_title, ie=VesselIE.ie_key()) # Look for embedded rtl.nl player matches = 
re.findall( r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', webpage) if matches: - return _playlist_from_matches(matches, ie='RtlNl') + return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl') vimeo_urls = VimeoIE._extract_urls(url, webpage) if vimeo_urls: - return _playlist_from_matches(vimeo_urls, ie=VimeoIE.ie_key()) + return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key()) vid_me_embed_url = self._search_regex( r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', @@ -1912,25 +1904,25 @@ class GenericIE(InfoExtractor): (?:embed|v|p)/.+?) \1''', webpage) if matches: - return _playlist_from_matches( - matches, lambda m: unescapeHTML(m[1])) + return self.playlist_from_matches( + matches, video_id, video_title, lambda m: unescapeHTML(m[1])) # Look for lazyYT YouTube embed matches = re.findall( r'class="lazyYT" data-youtube-id="([^"]+)"', webpage) if matches: - return _playlist_from_matches(matches, lambda m: unescapeHTML(m)) + return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m)) # Look for Wordpress "YouTube Video Importer" plugin matches = re.findall(r'''(?x)<div[^>]+ class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) if matches: - return _playlist_from_matches(matches, lambda m: m[-1]) + return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1]) matches = DailymotionIE._extract_urls(webpage) if matches: - return _playlist_from_matches(matches) + return self.playlist_from_matches(matches, video_id, video_title) # Look for embedded Dailymotion playlist player (#3822) m = re.search( @@ -1939,8 +1931,8 @@ class GenericIE(InfoExtractor): playlists = re.findall( r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url'))) if playlists: - return _playlist_from_matches( - playlists, lambda p: '//dailymotion.com/playlist/%s' % p) 
+ return self.playlist_from_matches( + playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p) # Look for embedded Wistia player match = re.search( @@ -2047,8 +2039,9 @@ class GenericIE(InfoExtractor): if mobj is not None: embeds = self._parse_json(mobj.group(1), video_id, fatal=False) if embeds: - return _playlist_from_matches( - embeds, getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') + return self.playlist_from_matches( + embeds, video_id, video_title, + getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') # Look for Aparat videos mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage) @@ -2110,13 +2103,13 @@ class GenericIE(InfoExtractor): # Look for funnyordie embed matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage) if matches: - return _playlist_from_matches( - matches, getter=unescapeHTML, ie='FunnyOrDie') + return self.playlist_from_matches( + matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie') # Look for BBC iPlayer embed matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage) if matches: - return _playlist_from_matches(matches, ie='BBCCoUk') + return self.playlist_from_matches(matches, video_id, video_title, ie='BBCCoUk') # Look for embedded RUTV player rutv_url = RUTVIE._extract_url(webpage) @@ -2131,32 +2124,32 @@ class GenericIE(InfoExtractor): # Look for embedded SportBox player sportbox_urls = SportBoxEmbedIE._extract_urls(webpage) if sportbox_urls: - return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed') + return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie='SportBoxEmbed') # Look for embedded XHamster player xhamster_urls = XHamsterEmbedIE._extract_urls(webpage) if xhamster_urls: - return 
_playlist_from_matches(xhamster_urls, ie='XHamsterEmbed') + return self.playlist_from_matches(xhamster_urls, video_id, video_title, ie='XHamsterEmbed') # Look for embedded TNAFlixNetwork player tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage) if tnaflix_urls: - return _playlist_from_matches(tnaflix_urls, ie=TNAFlixNetworkEmbedIE.ie_key()) + return self.playlist_from_matches(tnaflix_urls, video_id, video_title, ie=TNAFlixNetworkEmbedIE.ie_key()) # Look for embedded PornHub player pornhub_urls = PornHubIE._extract_urls(webpage) if pornhub_urls: - return _playlist_from_matches(pornhub_urls, ie=PornHubIE.ie_key()) + return self.playlist_from_matches(pornhub_urls, video_id, video_title, ie=PornHubIE.ie_key()) # Look for embedded DrTuber player drtuber_urls = DrTuberIE._extract_urls(webpage) if drtuber_urls: - return _playlist_from_matches(drtuber_urls, ie=DrTuberIE.ie_key()) + return self.playlist_from_matches(drtuber_urls, video_id, video_title, ie=DrTuberIE.ie_key()) # Look for embedded RedTube player redtube_urls = RedTubeIE._extract_urls(webpage) if redtube_urls: - return _playlist_from_matches(redtube_urls, ie=RedTubeIE.ie_key()) + return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key()) # Look for embedded Tvigle player mobj = re.search( @@ -2202,12 +2195,12 @@ class GenericIE(InfoExtractor): # Look for embedded soundcloud player soundcloud_urls = SoundcloudIE._extract_urls(webpage) if soundcloud_urls: - return _playlist_from_matches(soundcloud_urls, getter=unescapeHTML, ie=SoundcloudIE.ie_key()) + return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML, ie=SoundcloudIE.ie_key()) # Look for tunein player tunein_urls = TuneInBaseIE._extract_urls(webpage) if tunein_urls: - return _playlist_from_matches(tunein_urls) + return self.playlist_from_matches(tunein_urls, video_id, video_title) # Look for embedded mtvservices player mtvservices_url = 
MTVServicesEmbeddedIE._extract_url(webpage) @@ -2490,35 +2483,35 @@ class GenericIE(InfoExtractor): # Look for DBTV embeds dbtv_urls = DBTVIE._extract_urls(webpage) if dbtv_urls: - return _playlist_from_matches(dbtv_urls, ie=DBTVIE.ie_key()) + return self.playlist_from_matches(dbtv_urls, video_id, video_title, ie=DBTVIE.ie_key()) # Look for Videa embeds videa_urls = VideaIE._extract_urls(webpage) if videa_urls: - return _playlist_from_matches(videa_urls, ie=VideaIE.ie_key()) + return self.playlist_from_matches(videa_urls, video_id, video_title, ie=VideaIE.ie_key()) # Look for 20 minuten embeds twentymin_urls = TwentyMinutenIE._extract_urls(webpage) if twentymin_urls: - return _playlist_from_matches( - twentymin_urls, ie=TwentyMinutenIE.ie_key()) + return self.playlist_from_matches( + twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key()) # Look for Openload embeds openload_urls = OpenloadIE._extract_urls(webpage) if openload_urls: - return _playlist_from_matches( - openload_urls, ie=OpenloadIE.ie_key()) + return self.playlist_from_matches( + openload_urls, video_id, video_title, ie=OpenloadIE.ie_key()) # Look for VideoPress embeds videopress_urls = VideoPressIE._extract_urls(webpage) if videopress_urls: - return _playlist_from_matches( - videopress_urls, ie=VideoPressIE.ie_key()) + return self.playlist_from_matches( + videopress_urls, video_id, video_title, ie=VideoPressIE.ie_key()) # Look for Rutube embeds rutube_urls = RutubeIE._extract_urls(webpage) if rutube_urls: - return _playlist_from_matches( + return self.playlist_from_matches( rutube_urls, ie=RutubeIE.ie_key()) # Looking for http://schema.org/VideoObject From 68220649fa0b1c06c16a80ce51cc21f8d3264a4c Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 19 Mar 2017 20:42:17 +0800 Subject: [PATCH 026/200] [ChangeLog] Update after #12099 --- ChangeLog | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog b/ChangeLog index eeb5813c5..d70637b69 100644 --- a/ChangeLog 
+++ b/ChangeLog @@ -1,6 +1,7 @@ version <unreleased> Extractors ++ [bostonglobe] Add extractor for bostonglobe.com (#12099) * [openload] Fix extraction (#10408) From 45e6ad21b4f024c1721dc3dd2b53f15d7efa8aa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 19 Mar 2017 23:48:02 +0700 Subject: [PATCH 027/200] Credit @mrBliss for vtm (#11912) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 273a6a034..2d676b210 100644 --- a/AUTHORS +++ b/AUTHORS @@ -209,3 +209,4 @@ Olivier Bilodeau Lars Vierbergen Juanjo Benages Xiao Di Guan +Thomas Winant From 9487ce03e998337cbc69db250009e11b52c3b255 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 19 Mar 2017 23:59:40 +0700 Subject: [PATCH 028/200] [YoutubeDL] Allow multiple input URLs to be used with stdout as output template --- youtube_dl/YoutubeDL.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 13a3a909e..cb502c26f 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1872,6 +1872,7 @@ class YoutubeDL(object): """Download a given list of URLs.""" outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) if (len(url_list) > 1 and + outtmpl != '-' and '%' not in outtmpl and self.params.get('max_downloads') != 1): raise SameFileError(outtmpl) From 0ecdd3adbd104786c901944a316b87f58056bcdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 20 Mar 2017 00:03:58 +0700 Subject: [PATCH 029/200] [ChangeLog] Actualize --- ChangeLog | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ChangeLog b/ChangeLog index d70637b69..5a316f0ac 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,15 @@ version <unreleased> +Core ++ [YoutubeDL] Allow multiple input URLs to be used with stdout (-) as + output template ++ [adobepass] Detect and output error on authz token extraction (#12472) + Extractors + 
[bostonglobe] Add extractor for bostonglobe.com (#12099) ++ [toongoggles] Add support for toongoggles.com (#12171) ++ [medialaan] Add support for Medialaan sites (#9974, #11912) ++ [discoverynetworks] Add support for more domains and bypass geo restiction * [openload] Fix extraction (#10408) From 0e9a73e6120965fc2c2a1a2a1a30f7d38af4c73a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 20 Mar 2017 00:07:57 +0700 Subject: [PATCH 030/200] release 2017.03.20 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 5 ++++- youtube_dl/version.py | 2 +- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 0e94b6cde..4273fedbf 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.03.16*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.03.16** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.03.20*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.03.20** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.03.16 +[debug] youtube-dl version 2017.03.20 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 5a316f0ac..bbbf3c34d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2017.03.20 Core + [YoutubeDL] Allow multiple input URLs to be used with stdout (-) as diff --git a/docs/supportedsites.md b/docs/supportedsites.md index cc0309f97..6a7be28cb 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -108,6 +108,7 @@ - **blinkx** - **Bloomberg** - **BokeCC** + - **BostonGlobe** - **Bpb**: Bundeszentrale für politische Bildung - **BR**: Bayerischer Rundfunk Mediathek - **BravoTV** @@ -209,6 +210,7 @@ - **Discovery** - **DiscoveryGo** - **DiscoveryGoPlaylist** + - **DiscoveryNetworksDe** - **Disney** - **Dotsub** - **DouyuTV**: 斗鱼 @@ -425,6 +427,7 @@ - **MatchTV** - **MDR**: MDR.DE and KiKA - **media.ccc.de** + - **Medialaan** - **Meipai**: 美拍 - **MelonVOD** - **META** @@ -777,12 +780,12 @@ - **ThisAV** - **ThisOldHouse** - **tinypic**: tinypic.com videos - - **tlc.de** - **TMZ** - **TMZArticle** - **TNAFlix** - **TNAFlixNetworkEmbed** - **toggle** + - **ToonGoggles** - **Tosh**: Tosh.0 - **tou.tv** - **Toypics**: Toypics user profile diff --git 
a/youtube_dl/version.py b/youtube_dl/version.py index f38f130bf..a65f2e741 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.03.16' +__version__ = '2017.03.20' From 957f453429d584615ac4d2277caeb0d75d0fe1d9 Mon Sep 17 00:00:00 2001 From: Vijay Singh <sudovijay@users.noreply.github.com> Date: Mon, 20 Mar 2017 09:22:32 +0530 Subject: [PATCH 031/200] [Openload.co] Fixed Extraction They did it again. just a minor change though. here's quick fix --- youtube_dl/extractor/openload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 435aec28e..58ffde541 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -110,7 +110,7 @@ class OpenloadIE(InfoExtractor): elif H % 2 == 0 and H != 0 and ord(v[H - 1]) < 60: i = int(C, 10) h += 1 - index = H % 12 + index = H % 7 A = hashMap[index] i ^= 213 From 8a8cc339b6b5189b3c1fdb15ef7224c035b21a6b Mon Sep 17 00:00:00 2001 From: John Hawkinson <jhawk@mit.edu> Date: Mon, 20 Mar 2017 11:35:13 -0400 Subject: [PATCH 032/200] [senateisvp] Allow https URL scheme for embeds --- youtube_dl/extractor/senateisvp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py index 387a4f7f6..db5ef8b57 100644 --- a/youtube_dl/extractor/senateisvp.py +++ b/youtube_dl/extractor/senateisvp.py @@ -89,7 +89,7 @@ class SenateISVPIE(InfoExtractor): @staticmethod def _search_iframe_url(webpage): mobj = re.search( - r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]", + r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]", webpage) if mobj: return mobj.group('url') From 97952bdb78854bf09c688eb535dc7b67265934c1 Mon Sep 17 00:00:00 2001 From: John Hawkinson <jhawk@mit.edu> Date: Tue, 21 Mar 2017 13:12:14 -0400 Subject: 
[PATCH 033/200] [generic] Add test for Senate ISVP iframe embed --- youtube_dl/extractor/generic.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a71d6bac0..cb6308d29 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1542,6 +1542,17 @@ class GenericIE(InfoExtractor): 'url': 'http://www.golfchannel.com/topics/shows/golftalkcentral.htm', 'only_matching': True, }, + { + # Senate ISVP iframe https + 'url': 'https://www.hsgac.senate.gov/hearings/canadas-fast-track-refugee-plan-unanswered-questions-and-implications-for-us-national-security', + 'md5': 'fb8c70b0b515e5037981a2492099aab8', + 'info_dict': { + 'id': 'govtaff020316', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player', + }, + 'add_ie': [SenateISVPIE.ie_key()], + }, # { # # TODO: find another test # # http://schema.org/VideoObject From 21fbf0f955f584ad2d02608850a69a2fd74b65a6 Mon Sep 17 00:00:00 2001 From: Throaway <Throaway@null.com> Date: Mon, 20 Mar 2017 16:29:39 -0700 Subject: [PATCH 034/200] [pornhub] Decode obfuscated video URL (closes #12470) --- youtube_dl/extractor/pornhub.py | 37 ++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 9b413590a..eb316ad14 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -1,7 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals +import functools import itertools +import operator # import os import re @@ -129,9 +131,38 @@ class PornHubIE(InfoExtractor): tv_webpage = dl_webpage('tv') - video_url = self._search_regex( - r'<video[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//.+?)\1', tv_webpage, - 'video url', group='url') + encoded_url = self._search_regex(r'(var.*mediastring.*)</script>', + tv_webpage, 'encoded url') + assignments = encoded_url.split(";") + js_vars = {} + + def 
parse_js_value(inp): + inp = re.sub(r'/\*[^*]*\*/', "", inp) + + if "+" in inp: + inps = inp.split("+") + return functools.reduce(operator.concat, map(parse_js_value, inps)) + + inp = inp.strip() + if inp in js_vars: + return js_vars[inp] + + # Hope it's a string! + assert inp.startswith('"') and inp.endswith('"') + return inp[1:-1] + + for assn in assignments: + assn = assn.strip() + if len(assn) == 0: + continue + + assert assn.startswith("var ") + assn = assn[4:] + vname, value = assn.split("=", 1) + + js_vars[vname] = parse_js_value(value) + + video_url = js_vars["mediastring"] title = self._search_regex( r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None) From e1e35d1ac66ab99202e8265ac811906de2aa87dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 22 Mar 2017 01:59:27 +0700 Subject: [PATCH 035/200] [pornhub] Improve extraction and style (closes #12515) --- youtube_dl/extractor/pornhub.py | 35 ++++++++++++++------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index eb316ad14..b25f1f193 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -20,6 +20,7 @@ from ..utils import ( js_to_json, orderedSet, # sanitized_Request, + remove_quotes, str_to_int, ) # from ..aes import ( @@ -131,38 +132,32 @@ class PornHubIE(InfoExtractor): tv_webpage = dl_webpage('tv') - encoded_url = self._search_regex(r'(var.*mediastring.*)</script>', - tv_webpage, 'encoded url') - assignments = encoded_url.split(";") + assignments = self._search_regex( + r'(var.+?mediastring.+?)</script>', tv_webpage, + 'encoded url').split(';') + js_vars = {} def parse_js_value(inp): - inp = re.sub(r'/\*[^*]*\*/', "", inp) - - if "+" in inp: - inps = inp.split("+") - return functools.reduce(operator.concat, map(parse_js_value, inps)) - + inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp) + if '+' in inp: + inps = inp.split('+') + return 
functools.reduce( + operator.concat, map(parse_js_value, inps)) inp = inp.strip() if inp in js_vars: return js_vars[inp] - - # Hope it's a string! - assert inp.startswith('"') and inp.endswith('"') - return inp[1:-1] + return remove_quotes(inp) for assn in assignments: assn = assn.strip() - if len(assn) == 0: + if not assn: continue - - assert assn.startswith("var ") - assn = assn[4:] - vname, value = assn.split("=", 1) - + assn = re.sub(r'var\s+', '', assn) + vname, value = assn.split('=', 1) js_vars[vname] = parse_js_value(value) - video_url = js_vars["mediastring"] + video_url = js_vars['mediastring'] title = self._search_regex( r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None) From 8e5a7c5e67a8fad446d22a7619cd6a09823a05e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 22 Mar 2017 02:28:04 +0700 Subject: [PATCH 036/200] [pluralsight] Omit module title from video title (closes #12506) --- youtube_dl/extractor/pluralsight.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index e0cbd045e..0c6e036ca 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -40,7 +40,7 @@ class PluralsightIE(PluralsightBaseIE): 'info_dict': { 'id': 'hosting-sql-server-windows-azure-iaas-m7-mgmt-04', 'ext': 'mp4', - 'title': 'Management of SQL Server - Demo Monitoring', + 'title': 'Demo Monitoring', 'duration': 338, }, 'skip': 'Requires pluralsight account credentials', @@ -187,7 +187,7 @@ class PluralsightIE(PluralsightBaseIE): if not clip: raise ExtractorError('Unable to resolve clip') - title = '%s - %s' % (module['title'], clip['title']) + title = clip['title'] QUALITIES = { 'low': {'width': 640, 'height': 480}, From e8686e51d77607347802f82c57278e7d675d022c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 22 Mar 2017 02:35:09 +0700 Subject: [PATCH 037/200] 
[ChangeLog] Actualize --- ChangeLog | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ChangeLog b/ChangeLog index bbbf3c34d..82e82b588 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +version <unreleased> + +Extractors +- [pluralsight] Omit module title from video title (#12506) +* [pornhub] Decode obfuscated video URL (#12470, #12515) +* [senateisvp] Allow https URL scheme for embeds (#12512) + + version 2017.03.20 Core From 093dad9e256e3237dfad3c57a19ae10ddadcbf9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 22 Mar 2017 02:36:50 +0700 Subject: [PATCH 038/200] release 2017.03.22 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 4273fedbf..31ba1de3d 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.03.20*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.03.20** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.03.22*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.03.22** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.03.20 +[debug] youtube-dl version 2017.03.22 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 82e82b588..dc5acbca9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2017.03.22 Extractors - [pluralsight] Omit module title from video title (#12506) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a65f2e741..4d722873d 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.03.20' +__version__ = '2017.03.22' From c183e14f89078593ab47f06e5076f00bee3c9dd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 22 Mar 2017 22:26:59 +0700 Subject: [PATCH 039/200] [viu] Relax _VALID_URL (closes #12529) --- youtube_dl/extractor/viu.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/viu.py b/youtube_dl/extractor/viu.py index 3fd889c8e..db6a65d2e 100644 --- a/youtube_dl/extractor/viu.py +++ b/youtube_dl/extractor/viu.py @@ -44,7 +44,7 @@ class ViuBaseIE(InfoExtractor): class ViuIE(ViuBaseIE): - _VALID_URL = r'(?:viu:|https?://www\.viu\.com/[a-z]{2}/media/)(?P<id>\d+)' + _VALID_URL = 
r'(?:viu:|https?://[^/]+\.viu\.com/[a-z]{2}/media/)(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.viu.com/en/media/1116705532?containerId=playlist-22168059', 'info_dict': { @@ -69,6 +69,9 @@ class ViuIE(ViuBaseIE): 'skip_download': 'm3u8 download', }, 'skip': 'Geo-restricted to Indonesia', + }, { + 'url': 'https://india.viu.com/en/media/1126286865', + 'only_matching': True, }] def _real_extract(self, url): From 391d076d7cf037b1d7849ea7cbbdd04950c46f96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 22 Mar 2017 23:22:14 +0700 Subject: [PATCH 040/200] [condenast] Fix extraction and style (closes #12526) --- youtube_dl/extractor/condenast.py | 43 +++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index 8d8f60598..d3463b874 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -9,13 +9,14 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( - orderedSet, - remove_end, - extract_attributes, - mimetype2ext, determine_ext, + extract_attributes, int_or_none, + js_to_json, + mimetype2ext, + orderedSet, parse_iso8601, + remove_end, ) @@ -66,6 +67,16 @@ class CondeNastIE(InfoExtractor): 'upload_date': '20130314', 'timestamp': 1363219200, } + }, { + 'url': 'http://video.gq.com/watch/the-closer-with-keith-olbermann-the-only-true-surprise-trump-s-an-idiot?c=series', + 'info_dict': { + 'id': '58d1865bfd2e6126e2000015', + 'ext': 'mp4', + 'title': 'The Only True Surprise? 
Trump’s an Idiot', + 'uploader': 'gq', + 'upload_date': '20170321', + 'timestamp': 1490126427, + }, }, { # JS embed 'url': 'http://player.cnevids.com/embedjs/55f9cf8b61646d1acf00000c/5511d76261646d5566020000.js', @@ -114,26 +125,33 @@ class CondeNastIE(InfoExtractor): }) video_id = query['videoId'] video_info = None - info_page = self._download_webpage( + info_page = self._download_json( 'http://player.cnevids.com/player/video.js', - video_id, 'Downloading video info', query=query, fatal=False) + video_id, 'Downloading video info', fatal=False, query=query) if info_page: - video_info = self._parse_json(self._search_regex( - r'loadCallback\(({.+})\)', info_page, 'video info'), video_id)['video'] - else: + video_info = info_page.get('video') + if not video_info: info_page = self._download_webpage( 'http://player.cnevids.com/player/loader.js', video_id, 'Downloading loader info', query=query) - video_info = self._parse_json(self._search_regex( - r'var\s+video\s*=\s*({.+?});', info_page, 'video info'), video_id) + video_info = self._parse_json( + self._search_regex( + r'(?s)var\s+config\s*=\s*({.+?});', info_page, 'config'), + video_id, transform_source=js_to_json)['video'] + title = video_info['title'] formats = [] - for fdata in video_info.get('sources', [{}])[0]: + for fdata in video_info['sources']: src = fdata.get('src') if not src: continue ext = mimetype2ext(fdata.get('type')) or determine_ext(src) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue quality = fdata.get('quality') formats.append({ 'format_id': ext + ('-%s' % quality if quality else ''), @@ -169,7 +187,6 @@ class CondeNastIE(InfoExtractor): path=remove_end(parsed_url.path, '.js').replace('/embedjs/', '/embed/'))) url_type = 'embed' - self.to_screen('Extracting from %s with the Condé Nast extractor' % self._SITES[site]) webpage = self._download_webpage(url, item_id) if url_type == 'series': 
From ca5ed022e962ecd6992c145ac7bc00b5963e5d69 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 22 Mar 2017 17:28:24 +0100 Subject: [PATCH 041/200] [hbo] add support for free episode urls and new formats extraction(closes #12519) --- youtube_dl/extractor/hbo.py | 45 ++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/hbo.py b/youtube_dl/extractor/hbo.py index 8116ad9bd..931f71a5a 100644 --- a/youtube_dl/extractor/hbo.py +++ b/youtube_dl/extractor/hbo.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( xpath_text, xpath_element, @@ -14,14 +15,26 @@ from ..utils import ( class HBOBaseIE(InfoExtractor): _FORMATS_INFO = { + 'pro7': { + 'width': 1280, + 'height': 720, + }, '1920': { 'width': 1280, 'height': 720, }, + 'pro6': { + 'width': 768, + 'height': 432, + }, '640': { 'width': 768, 'height': 432, }, + 'pro5': { + 'width': 640, + 'height': 360, + }, 'highwifi': { 'width': 640, 'height': 360, @@ -78,6 +91,17 @@ class HBOBaseIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( video_url.replace('.tar', '/base_index_w8.m3u8'), video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + elif source.tag == 'hls': + # #EXT-X-BYTERANGE is not supported by native hls downloader + # and ffmpeg (#10955) + # formats.extend(self._extract_m3u8_formats( + # video_url.replace('.tar', '/base_index.m3u8'), + # video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + continue + elif source.tag == 'dash': + formats.extend(self._extract_mpd_formats( + video_url.replace('.tar', '/manifest.mpd'), + video_id, mpd_id='dash', fatal=False)) else: format_info = self._FORMATS_INFO.get(source.tag, {}) formats.append({ @@ -112,10 +136,11 @@ class HBOBaseIE(InfoExtractor): class HBOIE(HBOBaseIE): + IE_NAME = 'hbo' _VALID_URL = 
r'https?://(?:www\.)?hbo\.com/video/video\.html\?.*vid=(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.hbo.com/video/video.html?autoplay=true&g=u&vid=1437839', - 'md5': '1c33253f0c7782142c993c0ba62a8753', + 'md5': '2c6a6bc1222c7e91cb3334dad1746e5a', 'info_dict': { 'id': '1437839', 'ext': 'mp4', @@ -131,11 +156,12 @@ class HBOIE(HBOBaseIE): class HBOEpisodeIE(HBOBaseIE): - _VALID_URL = r'https?://(?:www\.)?hbo\.com/(?!video)([^/]+/)+video/(?P<id>[0-9a-z-]+)\.html' + IE_NAME = 'hbo:episode' + _VALID_URL = r'https?://(?:www\.)?hbo\.com/(?P<path>(?!video)(?:(?:[^/]+/)+video|watch-free-episodes)/(?P<id>[0-9a-z-]+))(?:\.html)?' _TESTS = [{ 'url': 'http://www.hbo.com/girls/episodes/5/52-i-love-you-baby/video/ep-52-inside-the-episode.html?autoplay=true', - 'md5': '689132b253cc0ab7434237fc3a293210', + 'md5': '61ead79b9c0dfa8d3d4b07ef4ac556fb', 'info_dict': { 'id': '1439518', 'display_id': 'ep-52-inside-the-episode', @@ -147,16 +173,19 @@ class HBOEpisodeIE(HBOBaseIE): }, { 'url': 'http://www.hbo.com/game-of-thrones/about/video/season-5-invitation-to-the-set.html?autoplay=true', 'only_matching': True, + }, { + 'url': 'http://www.hbo.com/watch-free-episodes/last-week-tonight-with-john-oliver', + 'only_matching': True, }] def _real_extract(self, url): - display_id = self._match_id(url) + path, display_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, display_id) + content = self._download_json( + 'http://www.hbo.com/api/content/' + path, display_id)['content'] - video_id = self._search_regex( - r'(?P<q1>[\'"])videoId(?P=q1)\s*:\s*(?P<q2>[\'"])(?P<video_id>\d+)(?P=q2)', - webpage, 'video ID', group='video_id') + video_id = compat_str((content.get('parsed', {}).get( + 'common:FullBleedVideo', {}) or content['selectedEpisode'])['videoId']) info_dict = self._extract_from_id(video_id) info_dict['display_id'] = display_id From 579c99a284481243f30e80151c90a753f613778d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> 
Date: Wed, 22 Mar 2017 23:48:06 +0700 Subject: [PATCH 042/200] [cloudy] Fix extraction (closes #12525) --- youtube_dl/extractor/cloudy.py | 113 +++++++++++---------------------- 1 file changed, 36 insertions(+), 77 deletions(-) diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py index ae5ba0015..9bc8dbea4 100644 --- a/youtube_dl/extractor/cloudy.py +++ b/youtube_dl/extractor/cloudy.py @@ -1,97 +1,56 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_HTTPError, -) from ..utils import ( - ExtractorError, - HEADRequest, - remove_end, + str_to_int, + unified_strdate, ) class CloudyIE(InfoExtractor): _IE_DESC = 'cloudy.ec' - _VALID_URL = r'''(?x) - https?://(?:www\.)?cloudy\.ec/ - (?:v/|embed\.php\?id=) - (?P<id>[A-Za-z0-9]+) - ''' - _EMBED_URL = 'http://www.cloudy.ec/embed.php?id=%s' - _API_URL = 'http://www.cloudy.ec/api/player.api.php' - _MAX_TRIES = 2 - _TEST = { + _VALID_URL = r'https?://(?:www\.)?cloudy\.ec/(?:v/|embed\.php\?.*?\bid=)(?P<id>[A-Za-z0-9]+)' + _TESTS = [{ 'url': 'https://www.cloudy.ec/v/af511e2527aac', - 'md5': '5cb253ace826a42f35b4740539bedf07', + 'md5': '29832b05028ead1b58be86bf319397ca', 'info_dict': { 'id': 'af511e2527aac', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Funny Cats and Animals Compilation june 2013', + 'upload_date': '20130913', + 'view_count': int, } - } - - def _extract_video(self, video_id, file_key, error_url=None, try_num=0): - - if try_num > self._MAX_TRIES - 1: - raise ExtractorError('Unable to extract video URL', expected=True) - - form = { - 'file': video_id, - 'key': file_key, - } - - if error_url: - form.update({ - 'numOfErrors': try_num, - 'errorCode': '404', - 'errorUrl': error_url, - }) - - player_data = self._download_webpage( - self._API_URL, video_id, 'Downloading player data', query=form) - data = compat_parse_qs(player_data) - - try_num += 1 - - if 'error' in data: - raise 
ExtractorError( - '%s error: %s' % (self.IE_NAME, ' '.join(data['error_msg'])), - expected=True) - - title = data.get('title', [None])[0] - if title: - title = remove_end(title, '&asdasdas').strip() - - video_url = data.get('url', [None])[0] - - if video_url: - try: - self._request_webpage(HEADRequest(video_url), video_id, 'Checking video URL') - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in [404, 410]: - self.report_warning('Invalid video URL, requesting another', video_id) - return self._extract_video(video_id, file_key, video_url, try_num) - - return { - 'id': video_id, - 'url': video_url, - 'title': title, - } + }, { + 'url': 'http://www.cloudy.ec/embed.php?autoplay=1&id=af511e2527aac', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) - url = self._EMBED_URL % video_id - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + 'http://www.cloudy.ec/embed.php?id=%s' % video_id, video_id) - file_key = self._search_regex( - [r'key\s*:\s*"([^"]+)"', r'filekey\s*=\s*"([^"]+)"'], - webpage, 'file_key') + info = self._parse_html5_media_entries(url, webpage, video_id)[0] - return self._extract_video(video_id, file_key) + webpage = self._download_webpage( + 'https://www.cloudy.ec/v/%s' % video_id, video_id, fatal=False) + + if webpage: + info.update({ + 'title': self._search_regex( + r'<h\d[^>]*>([^<]+)<', webpage, 'title'), + 'upload_date': unified_strdate(self._search_regex( + r'>Published at (\d{4}-\d{1,2}-\d{1,2})', webpage, + 'upload date', fatal=False)), + 'view_count': str_to_int(self._search_regex( + r'([\d,.]+) views<', webpage, 'view count', fatal=False)), + }) + + if not info.get('title'): + info['title'] = video_id + + info['id'] = video_id + + return info From b0f7f21cb92ca3af1795f68737ffa25196968dc6 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: 
Thu, 23 Mar 2017 09:22:17 +0100 Subject: [PATCH 043/200] [channel9] fix extraction(closes #11323) --- youtube_dl/extractor/channel9.py | 345 ++++++++++++------------------- 1 file changed, 127 insertions(+), 218 deletions(-) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 865dbcaba..b1cb58530 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -5,8 +5,10 @@ import re from .common import InfoExtractor from ..utils import ( ExtractorError, - parse_filesize, - qualities, + unescapeHTML, + int_or_none, + parse_iso8601, + clean_html, ) @@ -20,46 +22,50 @@ class Channel9IE(InfoExtractor): ''' IE_DESC = 'Channel 9' IE_NAME = 'channel9' - _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)' + _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)' _TESTS = [{ 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', - 'md5': 'bbd75296ba47916b754e73c3a4bbdf10', + 'md5': '32083d4eaf1946db6d454313f44510ca', 'info_dict': { - 'id': 'Events/TechEd/Australia/2013/KOS002', - 'ext': 'mp4', + 'id': '6c413323-383a-49dc-88f9-a22800cab024', + 'ext': 'wmv', 'title': 'Developer Kick-Off Session: Stuff We Love', - 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f', + 'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731', 'duration': 4576, - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', + 'timestamp': 1377717420, + 'upload_date': '20130828', 'session_code': 'KOS002', - 'session_day': 'Day 1', 'session_room': 'Arena 1A', - 'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', - 'Mads Kristensen'], + 'session_speakers': ['Andrew Coates', 'Brady Gaster', 'Mads Kristensen', 'Ed Blankenship', 'Patrick Klug'], }, }, { 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', - 'md5': 
'b43ee4529d111bc37ba7ee4f34813e68', + 'md5': 'dcf983ee6acd2088e7188c3cf79b46bc', 'info_dict': { - 'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing', - 'ext': 'mp4', + 'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024', + 'ext': 'wmv', 'title': 'Self-service BI with Power BI - nuclear testing', - 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', + 'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54', 'duration': 1540, - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', + 'timestamp': 1386381991, + 'upload_date': '20131207', 'authors': ['Mike Wilmot'], }, }, { # low quality mp4 is best 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', 'info_dict': { - 'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', + 'id': '33ad69d2-6a4e-4172-83a1-a523013dec76', 'ext': 'mp4', 'title': 'Ranges for the Standard Library', - 'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d', + 'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372', 'duration': 5646, - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', + 'upload_date': '20150930', + 'timestamp': 1443640735, }, 'params': { 'skip_download': True, @@ -70,7 +76,7 @@ class Channel9IE(InfoExtractor): 'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b', 'title': 'Channel 9', }, - 'playlist_count': 2, + 'playlist_mincount': 100, }, { 'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS', 'only_matching': True, @@ -81,189 +87,6 @@ class Channel9IE(InfoExtractor): _RSS_URL = 'http://channel9.msdn.com/%s/RSS' - def _formats_from_html(self, html): - FORMAT_REGEX = r''' - (?x) - <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s* - <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s* - (?:<div\s+class="popup\s+rounded">\s* - <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s* - </div>)? 
# File size part may be missing - ''' - quality = qualities(( - 'MP3', 'MP4', - 'Low Quality WMV', 'Low Quality MP4', - 'Mid Quality WMV', 'Mid Quality MP4', - 'High Quality WMV', 'High Quality MP4')) - formats = [{ - 'url': x.group('url'), - 'format_id': x.group('quality'), - 'format_note': x.group('note'), - 'format': '%s (%s)' % (x.group('quality'), x.group('note')), - 'filesize_approx': parse_filesize(x.group('filesize')), - 'quality': quality(x.group('quality')), - 'vcodec': 'none' if x.group('note') == 'Audio only' else None, - } for x in list(re.finditer(FORMAT_REGEX, html))] - - self._sort_formats(formats) - - return formats - - def _extract_title(self, html): - title = self._html_search_meta('title', html, 'title') - if title is None: - title = self._og_search_title(html) - TITLE_SUFFIX = ' (Channel 9)' - if title is not None and title.endswith(TITLE_SUFFIX): - title = title[:-len(TITLE_SUFFIX)] - return title - - def _extract_description(self, html): - DESCRIPTION_REGEX = r'''(?sx) - <div\s+class="entry-content">\s* - <div\s+id="entry-body">\s* - (?P<description>.+?)\s* - </div>\s* - </div> - ''' - m = re.search(DESCRIPTION_REGEX, html) - if m is not None: - return m.group('description') - return self._html_search_meta('description', html, 'description') - - def _extract_duration(self, html): - m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html) - return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None - - def _extract_slides(self, html): - m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html) - return m.group('slidesurl') if m is not None else None - - def _extract_zip(self, html): - m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html) - return m.group('zipurl') if m is not None else None - - def _extract_avg_rating(self, html): - m = re.search(r'<p class="avg-rating">Avg Rating: 
<span>(?P<avgrating>[^<]+)</span></p>', html) - return float(m.group('avgrating')) if m is not None else 0 - - def _extract_rating_count(self, html): - m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html) - return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0 - - def _extract_view_count(self, html): - m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html) - return int(self._fix_count(m.group('viewcount'))) if m is not None else 0 - - def _extract_comment_count(self, html): - m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html) - return int(self._fix_count(m.group('commentcount'))) if m is not None else 0 - - def _fix_count(self, count): - return int(str(count).replace(',', '')) if count is not None else None - - def _extract_authors(self, html): - m = re.search(r'(?s)<li class="author">(.*?)</li>', html) - if m is None: - return None - return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1)) - - def _extract_session_code(self, html): - m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html) - return m.group('code') if m is not None else None - - def _extract_session_day(self, html): - m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html) - return m.group('day').strip() if m is not None else None - - def _extract_session_room(self, html): - m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html) - return m.group('room') if m is not None else None - - def _extract_session_speakers(self, html): - return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html) - - def _extract_content(self, html, content_path): - # Look for downloadable content - formats = self._formats_from_html(html) - slides = self._extract_slides(html) - zip_ = self._extract_zip(html) - - # Nothing to download - if 
len(formats) == 0 and slides is None and zip_ is None: - self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path) - return - - # Extract meta - title = self._extract_title(html) - description = self._extract_description(html) - thumbnail = self._og_search_thumbnail(html) - duration = self._extract_duration(html) - avg_rating = self._extract_avg_rating(html) - rating_count = self._extract_rating_count(html) - view_count = self._extract_view_count(html) - comment_count = self._extract_comment_count(html) - - common = { - '_type': 'video', - 'id': content_path, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'avg_rating': avg_rating, - 'rating_count': rating_count, - 'view_count': view_count, - 'comment_count': comment_count, - } - - result = [] - - if slides is not None: - d = common.copy() - d.update({'title': title + '-Slides', 'url': slides}) - result.append(d) - - if zip_ is not None: - d = common.copy() - d.update({'title': title + '-Zip', 'url': zip_}) - result.append(d) - - if len(formats) > 0: - d = common.copy() - d.update({'title': title, 'formats': formats}) - result.append(d) - - return result - - def _extract_entry_item(self, html, content_path): - contents = self._extract_content(html, content_path) - if contents is None: - return contents - - if len(contents) > 1: - raise ExtractorError('Got more than one entry') - result = contents[0] - result['authors'] = self._extract_authors(html) - - return result - - def _extract_session(self, html, content_path): - contents = self._extract_content(html, content_path) - if contents is None: - return contents - - session_meta = { - 'session_code': self._extract_session_code(html), - 'session_day': self._extract_session_day(html), - 'session_room': self._extract_session_room(html), - 'session_speakers': self._extract_session_speakers(html), - } - - for content in contents: - content.update(session_meta) - - return 
self.playlist_result(contents) - def _extract_list(self, video_id, rss_url=None): if not rss_url: rss_url = self._RSS_URL % video_id @@ -274,9 +97,7 @@ class Channel9IE(InfoExtractor): return self.playlist_result(entries, video_id, title_text) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - content_path = mobj.group('contentpath') - rss = mobj.group('rss') + content_path, rss = re.match(self._VALID_URL, url).groups() if rss: return self._extract_list(content_path, url) @@ -284,17 +105,105 @@ class Channel9IE(InfoExtractor): webpage = self._download_webpage( url, content_path, 'Downloading web page') - page_type = self._search_regex( - r'<meta[^>]+name=(["\'])WT\.entryid\1[^>]+content=(["\'])(?P<pagetype>[^:]+).+?\2', - webpage, 'page type', default=None, group='pagetype') - if page_type: - if page_type == 'Entry': # Any 'item'-like page, may contain downloadable content - return self._extract_entry_item(webpage, content_path) - elif page_type == 'Session': # Event session page, may contain downloadable content - return self._extract_session(webpage, content_path) - elif page_type == 'Event': - return self._extract_list(content_path) + episode_data = self._search_regex( + r"data-episode='([^']+)'", webpage, 'episode data', default=None) + if episode_data: + episode_data = self._parse_json(unescapeHTML( + episode_data), content_path) + content_id = episode_data['contentId'] + is_session = '/Sessions(' in episode_data['api'] + content_url = 'https://channel9.msdn.com/odata' + episode_data['api'] + if is_session: + content_url += '?$expand=Speakers' else: - raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True) - else: # Assuming list + content_url += '?$expand=Authors' + content_data = self._download_json(content_url, content_id) + title = content_data['Title'] + + formats = [] + qualities = [ + 'VideoMP4Low', + 'VideoWMV', + 'VideoMP4Medium', + 'VideoMP4High', + 'VideoWMVHQ', + ] + for q in qualities: + q_url = 
content_data.get(q) + if not q_url: + continue + formats.append({ + 'format_id': q, + 'url': q_url, + }) + slides = content_data.get('Slides') + zip_file = content_data.get('ZipFile') + + if not formats and not slides and not zip_file: + raise ExtractorError( + 'None of recording, slides or zip are available for %s' % content_path) + + subtitles = {} + for caption in content_data.get('Captions', []): + caption_url = caption.get('Url') + if not caption_url: + continue + subtitles.setdefault(caption.get('Language', 'en'), []).append({ + 'url': caption_url, + 'ext': 'vtt', + }) + + common = { + 'id': content_id, + 'title': title, + 'description': clean_html(content_data.get('Description') or content_data.get('Body')), + 'thumbnail': content_data.get('Thumbnail') or content_data.get('VideoPlayerPreviewImage'), + 'duration': int_or_none(content_data.get('MediaLengthInSeconds')), + 'timestamp': parse_iso8601(content_data.get('PublishedDate')), + 'avg_rating': int_or_none(content_data.get('Rating')), + 'rating_count': int_or_none(content_data.get('RatingCount')), + 'view_count': int_or_none(content_data.get('Views')), + 'comment_count': int_or_none(content_data.get('CommentCount')), + 'subtitles': subtitles, + } + if is_session: + speakers = [] + for s in content_data.get('Speakers', []): + speaker_name = s.get('FullName') + if not speaker_name: + continue + speakers.append(speaker_name) + + common.update({ + 'session_code': content_data.get('Code'), + 'session_room': content_data.get('Room'), + 'session_speakers': speakers, + }) + else: + authors = [] + for a in content_data.get('Authors', []): + author_name = a.get('DisplayName') + if not author_name: + continue + authors.append(author_name) + common['authors'] = authors + + contents = [] + + if slides: + d = common.copy() + d.update({'title': title + '-Slides', 'url': slides}) + contents.append(d) + + if zip_file: + d = common.copy() + d.update({'title': title + '-Zip', 'url': zip_file}) + contents.append(d) + + if 
formats: + d = common.copy() + d.update({'title': title, 'formats': formats}) + contents.append(d) + return self.playlist_result(contents) + else: return self._extract_list(content_path) From 52d5ecabd518db46fc02b8624b2ad04ba7cf2114 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 23 Mar 2017 13:48:32 +0100 Subject: [PATCH 044/200] [bellmedia] add support for etalk.ca(closes #12447) --- youtube_dl/extractor/bellmedia.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bellmedia.py b/youtube_dl/extractor/bellmedia.py index 1f5b6ed92..8820a3914 100644 --- a/youtube_dl/extractor/bellmedia.py +++ b/youtube_dl/extractor/bellmedia.py @@ -21,10 +21,11 @@ class BellMediaIE(InfoExtractor): animalplanet| bravo| mtv| - space + space| + etalk )\.ca| much\.com - )/.*?(?:\bvid=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})''' + )/.*?(?:\bvid(?:eoid)?=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})''' _TESTS = [{ 'url': 'http://www.ctv.ca/video/player?vid=706966', 'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0', @@ -58,6 +59,9 @@ class BellMediaIE(InfoExtractor): }, { 'url': 'http://www.ctv.ca/DCs-Legends-of-Tomorrow/Video/S2E11-Turncoat-vid1051430', 'only_matching': True, + }, { + 'url': 'http://www.etalk.ca/video?videoid=663455', + 'only_matching': True, }] _DOMAINS = { 'thecomedynetwork': 'comedy', @@ -65,6 +69,7 @@ class BellMediaIE(InfoExtractor): 'sciencechannel': 'discsci', 'investigationdiscovery': 'invdisc', 'animalplanet': 'aniplan', + 'etalk': 'ctv', } def _real_extract(self, url): From d0572557c2a88e34d85715af4271e8b5decbdfdb Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 23 Mar 2017 13:52:07 +0100 Subject: [PATCH 045/200] [ninecninemedia] remove mp4 url extraction request --- youtube_dl/extractor/ninecninemedia.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py index d9943fc2c..8961309fd 
100644 --- a/youtube_dl/extractor/ninecninemedia.py +++ b/youtube_dl/extractor/ninecninemedia.py @@ -34,12 +34,6 @@ class NineCNineMediaStackIE(NineCNineMediaBaseIE): formats.extend(self._extract_f4m_formats( stack_base_url + 'f4m', stack_id, f4m_id='hds', fatal=False)) - mp4_url = self._download_webpage(stack_base_url + 'pd', stack_id, fatal=False) - if mp4_url: - formats.append({ - 'url': mp4_url, - 'format_id': 'mp4', - }) self._sort_formats(formats) return { From a5d783f525a8d4b62777434607c7f1efc5c34ece Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 23 Mar 2017 23:47:43 +0700 Subject: [PATCH 046/200] [channel9] Extract more formats --- youtube_dl/extractor/channel9.py | 76 +++++++++++++++++++++++++++----- 1 file changed, 65 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index b1cb58530..717e4eb3b 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -9,6 +9,7 @@ from ..utils import ( int_or_none, parse_iso8601, clean_html, + qualities, ) @@ -120,22 +121,75 @@ class Channel9IE(InfoExtractor): content_data = self._download_json(content_url, content_id) title = content_data['Title'] + QUALITIES = ( + 'mp3', + 'wmv', 'mp4', + 'wmv-low', 'mp4-low', + 'wmv-mid', 'mp4-mid', + 'wmv-high', 'mp4-high', + ) + + quality_key = qualities(QUALITIES) + + def quality(quality_id, format_url): + return (len(QUALITIES) if '_Source.' 
in format_url + else quality_key(quality_id)) + formats = [] - qualities = [ - 'VideoMP4Low', - 'VideoWMV', - 'VideoMP4Medium', - 'VideoMP4High', - 'VideoWMVHQ', - ] - for q in qualities: - q_url = content_data.get(q) - if not q_url: + urls = set() + + SITE_QUALITIES = { + 'MP3': 'mp3', + 'MP4': 'mp4', + 'Low Quality WMV': 'wmv-low', + 'Low Quality MP4': 'mp4-low', + 'Mid Quality WMV': 'wmv-mid', + 'Mid Quality MP4': 'mp4-mid', + 'High Quality WMV': 'wmv-high', + 'High Quality MP4': 'mp4-high', + } + + formats_select = self._search_regex( + r'(?s)<select[^>]+name=["\']format[^>]+>(.+?)</select', webpage, + 'formats select', default=None) + if formats_select: + for mobj in re.finditer( + r'<option\b[^>]+\bvalue=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>\s*(?P<format>[^<]+?)\s*<', + formats_select): + format_url = mobj.group('url') + if format_url in urls: + continue + urls.add(format_url) + format_id = mobj.group('format') + quality_id = SITE_QUALITIES.get(format_id, format_id) + formats.append({ + 'url': format_url, + 'format_id': quality_id, + 'quality': quality(quality_id, format_url), + 'vcodec': 'none' if quality_id == 'mp3' else None, + }) + + API_QUALITIES = { + 'VideoMP4Low': 'mp4-low', + 'VideoWMV': 'wmv-mid', + 'VideoMP4Medium': 'mp4-mid', + 'VideoMP4High': 'mp4-high', + 'VideoWMVHQ': 'wmv-hq', + } + + for format_id, q in API_QUALITIES.items(): + q_url = content_data.get(format_id) + if not q_url or q_url in urls: continue + urls.add(q_url) formats.append({ - 'format_id': q, 'url': q_url, + 'format_id': q, + 'quality': quality(q, q_url), }) + + self._sort_formats(formats) + slides = content_data.get('Slides') zip_file = content_data.get('ZipFile') From bea7af694748f3d731ab4340539251f2daf5cc10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 23 Mar 2017 23:58:12 +0700 Subject: [PATCH 047/200] [channel9] Remove expired comment and sort imports --- youtube_dl/extractor/channel9.py | 11 ++--------- 1 file changed, 2 
insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 717e4eb3b..e92894246 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -4,23 +4,16 @@ import re from .common import InfoExtractor from ..utils import ( + clean_html, ExtractorError, - unescapeHTML, int_or_none, parse_iso8601, - clean_html, qualities, + unescapeHTML, ) class Channel9IE(InfoExtractor): - ''' - Common extractor for channel9.msdn.com. - - The type of provided URL (video or playlist) is determined according to - meta Search.PageType from web page HTML rather than URL itself, as it is - not always possible to do. - ''' IE_DESC = 'Channel 9' IE_NAME = 'channel9' _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)' From 7963b6cba8d020d5553bb98aee1d098870f78f42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 24 Mar 2017 00:19:58 +0700 Subject: [PATCH 048/200] [ChangeLog] Actualize --- ChangeLog | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ChangeLog b/ChangeLog index dc5acbca9..2df64ea73 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,15 @@ +version <unreleased> + +Extractors +- [9c9media] Remove mp4 URL extraction request ++ [bellmedia] Add support for etalk.ca and space.ca (#12447) +* [channel9] Fix extraction (#11323) +* [cloudy] Fix extraction (#12525) ++ [hbo] Add support for free episode URLs and new formats extraction (#12519) +* [condenast] Fix extraction and style (#12526) +* [viu] Relax URL regular expression (#12529) + + version 2017.03.22 Extractors From a3ccd6bd11454b9760ef2c5f09f02f3afdb11af5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 24 Mar 2017 00:24:23 +0700 Subject: [PATCH 049/200] release 2017.03.24 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 4 ++-- youtube_dl/version.py 
| 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 31ba1de3d..dfff41d2d 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.03.22*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.03.22** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.03.24*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.03.24** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.03.22 +[debug] youtube-dl version 2017.03.24 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 2df64ea73..78377dcb4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2017.03.24 Extractors - [9c9media] Remove mp4 URL extraction request diff --git 
a/docs/supportedsites.md b/docs/supportedsites.md index 6a7be28cb..7c99ba3c2 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -312,8 +312,8 @@ - **GPUTechConf** - **Groupon** - **Hark** - - **HBO** - - **HBOEpisode** + - **hbo** + - **hbo:episode** - **HearThisAt** - **Heise** - **HellPorno** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 4d722873d..13904c724 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.03.22' +__version__ = '2017.03.24' From 54b960f340ed5398136ef0206d17cafba2575678 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 24 Mar 2017 00:45:24 +0700 Subject: [PATCH 050/200] [generic] Do not follow redirects to the same URL --- youtube_dl/extractor/generic.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index cb6308d29..da9d04efc 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2635,11 +2635,14 @@ class GenericIE(InfoExtractor): found = re.search(REDIRECT_REGEX, refresh_header) if found: new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1))) - self.report_following_redirect(new_url) - return { - '_type': 'url', - 'url': new_url, - } + if new_url != url: + self.report_following_redirect(new_url) + return { + '_type': 'url', + 'url': new_url, + } + else: + found = None if not found: # twitter:player is a https URL to iframe player that may or may not From d0ba55871e6754fdc8a6a28543581989ba3c50fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 25 Mar 2017 01:17:17 +0700 Subject: [PATCH 051/200] [youtube] Improve _VALID_URLs (closes #12538) --- youtube_dl/extractor/youtube.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py 
b/youtube_dl/extractor/youtube.py index caa048249..ca40de522 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -59,6 +59,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False + _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL)[0-9A-Za-z-_]{10,}' + def _set_language(self): self._set_cookie( '.youtube.com', 'PREF', 'f1=50000000&hl=en', @@ -265,9 +267,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ) )? # all until now is optional -> you can pass the naked ID ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID - (?!.*?\blist=) # combined list/video URLs are handled by the playlist IE + (?!.*?\blist= + (?: + %(playlist_id)s| # combined list/video URLs are handled by the playlist IE + WL # WL are handled by the watch later IE + ) + ) (?(1).+)? # if we found the ID, everything can follow - $""" + $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' _formats = { '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, @@ -924,6 +931,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'sJL6WA-aGkQ', 'only_matching': True, }, + { + 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', + 'only_matching': True, + }, ] def __init__(self, *args, **kwargs): @@ -1864,8 +1875,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): ) .* | - ((?:PL|LL|EC|UU|FL|RD|UL|TL)[0-9A-Za-z-_]{10,}) - )""" + (%(playlist_id)s) + )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&disable_polymer=true' _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?' 
IE_NAME = 'youtube:playlist' From 31a1214076ff41efbaa7f7243565da830d1e2c7e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 25 Mar 2017 07:03:13 +0100 Subject: [PATCH 052/200] [franceculture] fix extraction(closes #12547) --- youtube_dl/extractor/franceculture.py | 31 ++++++++++++++++----------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py index b98da692c..df3d757f3 100644 --- a/youtube_dl/extractor/franceculture.py +++ b/youtube_dl/extractor/franceculture.py @@ -4,6 +4,8 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( determine_ext, + extract_attributes, + int_or_none, unified_strdate, ) @@ -19,6 +21,7 @@ class FranceCultureIE(InfoExtractor): 'title': 'Rendez-vous au pays des geeks', 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20140301', + 'timestamp': 1393642916, 'vcodec': 'none', } } @@ -28,30 +31,34 @@ class FranceCultureIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - video_url = self._search_regex( - r'(?s)<div[^>]+class="[^"]*?title-zone-diffusion[^"]*?"[^>]*>.*?<button[^>]+data-asset-source="([^"]+)"', - webpage, 'video path') + video_data = extract_attributes(self._search_regex( + r'(?s)<div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*>.*?(<button[^>]+data-asset-source="[^"]+"[^>]+>)', + webpage, 'video data')) - title = self._og_search_title(webpage) + video_url = video_data['data-asset-source'] + title = video_data.get('data-asset-title') or self._og_search_title(webpage) - upload_date = unified_strdate(self._search_regex( - '(?s)<div[^>]+class="date"[^>]*>.*?<span[^>]+class="inner"[^>]*>([^<]+)<', - webpage, 'upload date', fatal=False)) + description = self._html_search_regex( + r'(?s)<div[^>]+class="intro"[^>]*>.*?<h2>(.+?)</h2>', + webpage, 'description', default=None) thumbnail = 
self._search_regex( - r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+data-dejavu-src="([^"]+)"', + r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+(?:data-dejavu-)?src="([^"]+)"', webpage, 'thumbnail', fatal=False) uploader = self._html_search_regex( - r'(?s)<div id="emission".*?<span class="author">(.*?)</span>', + r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None) - vcodec = 'none' if determine_ext(video_url.lower()) == 'mp3' else None + ext = determine_ext(video_url.lower()) return { 'id': display_id, 'display_id': display_id, 'url': video_url, 'title': title, + 'description': description, 'thumbnail': thumbnail, - 'vcodec': vcodec, + 'ext': ext, + 'vcodec': 'none' if ext == 'mp3' else None, 'uploader': uploader, - 'upload_date': upload_date, + 'timestamp': int_or_none(video_data.get('data-asset-created-date')), + 'duration': int_or_none(video_data.get('data-duration')), } From 1088d76da6cbc83d64faca5a1a987944af04b0ce Mon Sep 17 00:00:00 2001 From: zurfyx <zurfyx@gmail.com> Date: Sat, 25 Mar 2017 00:45:32 +0100 Subject: [PATCH 053/200] [atresplayer] Fix login error detection --- youtube_dl/extractor/atresplayer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index e3c669830..23a536ff2 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -90,7 +90,8 @@ class AtresPlayerIE(InfoExtractor): request, None, 'Logging in as %s' % username) error = self._html_search_regex( - r'(?s)<ul class="list_error">(.+?)</ul>', response, 'error', default=None) + r'(?s)<ul[^>]+class="[^"]*\blist_error\b[^"]*">(.+?)</ul>', + response, 'error', default=None) if error: raise ExtractorError( 'Unable to login: %s' % error, expected=True) From 048086920bdb92cc5d63847e9e1d2fd645910363 Mon Sep 17 00:00:00 2001 From: zurfyx <zurfyx@gmail.com> Date: Sat, 25 Mar 2017 01:08:47 
+0100 Subject: [PATCH 054/200] [atresplayer] Extract HD manifest --- youtube_dl/extractor/atresplayer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index 23a536ff2..940c548f4 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -161,7 +161,8 @@ class AtresPlayerIE(InfoExtractor): # this videos are protected by DRM, the f4m downloader doesn't support them continue else: - f4m_url = video_url[:-9] + '/manifest.f4m' + video_url_hd = video_url.replace('free_es', 'es') + f4m_url = video_url_hd[:-9] + '/manifest.f4m' formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) self._sort_formats(formats) From c7301e677bddb5d676ebf207a3ac485fce330057 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 25 Mar 2017 18:03:46 +0700 Subject: [PATCH 055/200] [atresplayer] Extract DASH and ISM formats --- youtube_dl/extractor/atresplayer.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index 940c548f4..ffac9df0e 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -160,10 +160,15 @@ class AtresPlayerIE(InfoExtractor): f4m_url = 'http://drg.antena3.com/{0}hds/es/sd.f4m'.format(f4m_path) # this videos are protected by DRM, the f4m downloader doesn't support them continue - else: - video_url_hd = video_url.replace('free_es', 'es') - f4m_url = video_url_hd[:-9] + '/manifest.f4m' - formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) + video_url_hd = video_url.replace('free_es', 'es') + formats.extend(self._extract_f4m_formats( + video_url_hd[:-9] + '/manifest.f4m', video_id, f4m_id='hds', + fatal=False)) + formats.extend(self._extract_mpd_formats( + video_url_hd[:-9] + '/manifest.mpd', video_id, 
mpd_id='dash', + fatal=False)) + formats.extend(self._extract_ism_formats( + video_url_hd, video_id, ism_id='mss', fatal=False)) self._sort_formats(formats) path_data = player.get('pathData') From e8e4cc5a6a3ad8bf94d9ff9e5bb2d72712e14c34 Mon Sep 17 00:00:00 2001 From: John Hawkinson <jhawk@mit.edu> Date: Sun, 19 Mar 2017 20:52:25 -0400 Subject: [PATCH 056/200] [generic] Replace LazyYT test with skiplagged discourse.ubuntu.com has gone away, replace with skiplagged.com. Be nice to have a non-frontpage URL that might be more stable, though I don't have one. Maybe this should move to html in test/test_InfoExtractor.py? --- youtube_dl/extractor/generic.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index da9d04efc..4fff93efe 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -902,12 +902,13 @@ class GenericIE(InfoExtractor): }, # LazyYT { - 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986', + 'url': 'https://skiplagged.com/', 'info_dict': { - 'id': '1986', - 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse', + 'id': 'skiplagged', + 'title': 'Skiplagged: The smart way to find cheap flights', }, - 'playlist_mincount': 2, + 'playlist_mincount': 1, + 'add_ie': ['Youtube'], }, # Cinchcast embed { From 7aa0ee321b4095da7a2430f383bea773115e1491 Mon Sep 17 00:00:00 2001 From: gkoelln <gkoelln7@gmail.com> Date: Sat, 25 Mar 2017 08:12:25 -0500 Subject: [PATCH 057/200] [fox] Add metadata extraction Add series, season number, episode number and episode.
--- youtube_dl/extractor/fox.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py index 9f2e5d065..cc5d62ebc 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .adobepass import AdobePassIE from ..utils import ( + int_or_none, smuggle_url, update_url_query, ) @@ -47,9 +48,12 @@ class FOXIE(AdobePassIE): resource = self._get_mvpd_resource('fbc-fox', None, ap_p['videoGUID'], rating) query['auth'] = self._extract_mvpd_auth(url, video_id, 'fbc-fox', resource) - return { + info = self._search_json_ld(webpage, video_id, fatal=False) + info.update({ '_type': 'url_transparent', 'ie_key': 'ThePlatform', 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}), 'id': video_id, - } + }) + + return info From d97729c83a747f48d83f4aba9b85d2a14a58b8b7 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 25 Mar 2017 14:28:53 +0100 Subject: [PATCH 058/200] [fox] remove unused import --- youtube_dl/extractor/fox.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py index cc5d62ebc..159fdf9c4 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals from .adobepass import AdobePassIE from ..utils import ( - int_or_none, smuggle_url, update_url_query, ) From c6c22e984d0d35172d8e39d2136d2059494d22b2 Mon Sep 17 00:00:00 2001 From: John Hawkinson <jhawk@mit.edu> Date: Sat, 25 Mar 2017 10:36:40 -0400 Subject: [PATCH 059/200] [test_download] Print additional IEs in summary output --- test/test_download.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/test/test_download.py b/test/test_download.py index 30034f978..01a8bcb89 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -71,6 +71,18 @@ class 
TestDownload(unittest.TestCase): maxDiff = None + def __str__(self): + """Identify each test with the `add_ie` attribute, if available.""" + + def strclass(cls): + """From 2.7's unittest; 2.6 had _strclass so we can't import it.""" + return '%s.%s' % (cls.__module__, cls.__name__) + + add_ie = getattr(self, self._testMethodName).add_ie + return '%s (%s)%s:' % (self._testMethodName, + strclass(self.__class__), + ' [%s]' % add_ie if add_ie else '') + def setUp(self): self.defs = defs @@ -233,6 +245,8 @@ for n, test_case in enumerate(defs): i += 1 test_method = generator(test_case, tname) test_method.__name__ = str(tname) + ie_list = test_case.get('add_ie') + test_method.add_ie = ie_list and ','.join(ie_list) setattr(TestDownload, test_method.__name__, test_method) del test_method From 610a6d10538d8ecab8e51dc083f02adbd09f706f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 25 Mar 2017 21:40:28 +0700 Subject: [PATCH 060/200] [atresplayer] Do not extract ISM formats As per @remitamine: the ISM downloader does not support videos served from wowza servers(it will produce broken files) --- youtube_dl/extractor/atresplayer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index ffac9df0e..bfda1e24e 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -167,8 +167,6 @@ class AtresPlayerIE(InfoExtractor): formats.extend(self._extract_mpd_formats( video_url_hd[:-9] + '/manifest.mpd', video_id, mpd_id='dash', fatal=False)) - formats.extend(self._extract_ism_formats( - video_url_hd, video_id, ism_id='mss', fatal=False)) self._sort_formats(formats) path_data = player.get('pathData') From d66d43c5547daf4fc1a269824a8432477fbb099d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 25 Mar 2017 18:13:13 +0100 Subject: [PATCH 061/200] [atvat] Add new extractor(closes #5325) --- 
youtube_dl/extractor/atvat.py | 73 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 74 insertions(+) create mode 100644 youtube_dl/extractor/atvat.py diff --git a/youtube_dl/extractor/atvat.py b/youtube_dl/extractor/atvat.py new file mode 100644 index 000000000..1584d53fc --- /dev/null +++ b/youtube_dl/extractor/atvat.py @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + unescapeHTML, +) + + +class ATVAtIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?atv\.at/(?:[^/]+/){2}(?P<id>[dv]\d+)' + _TESTS = [{ + 'url': 'http://atv.at/aktuell/di-210317-2005-uhr/v1698449/', + 'md5': 'c3b6b975fb3150fc628572939df205f2', + 'info_dict': { + 'id': '1698447', + 'ext': 'mp4', + 'title': 'DI, 21.03.17 | 20:05 Uhr 1/1', + } + }, { + 'url': 'http://atv.at/aktuell/meinrad-knapp/d8416/', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_data = self._parse_json(unescapeHTML(self._search_regex( + r'class="[^"]*jsb_video/FlashPlayer[^"]*"[^>]+data-jsb="([^"]+)"', + webpage, 'player data')), display_id)['config']['initial_video'] + + video_id = video_data['id'] + video_title = video_data['title'] + + parts = [] + for part in video_data.get('parts', []): + part_id = part['id'] + part_title = part['title'] + + formats = [] + for source in part.get('sources', []): + source_url = source.get('src') + if not source_url: + continue + ext = determine_ext(source_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, part_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'format_id': source.get('delivery'), + 'url': source_url, + }) + self._sort_formats(formats) + + parts.append({ + 'id': part_id, + 'title': part_title, + 'thumbnail': part.get('preview_image_url'), + 
'duration': int_or_none(part.get('duration')), + 'is_live': part.get('is_livestream'), + 'formats': formats, + }) + + return { + '_type': 'multi_video', + 'id': video_id, + 'title': video_title, + 'entries': parts, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 40a5c9842..6a7028a4d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -71,6 +71,7 @@ from .arte import ( ) from .atresplayer import AtresPlayerIE from .atttechchannel import ATTTechChannelIE +from .atvat import ATVAtIE from .audimedia import AudiMediaIE from .audioboom import AudioBoomIE from .audiomack import AudiomackIE, AudiomackAlbumIE From 51ef4919dfd51b5bd562f39f865a117f9a5cd304 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Mon, 6 Mar 2017 00:31:44 +0800 Subject: [PATCH 062/200] [afreecatv] Fix extraction (closes #12179) --- ChangeLog | 6 +++ youtube_dl/extractor/afreecatv.py | 72 +++++++++++++------------------ 2 files changed, 36 insertions(+), 42 deletions(-) diff --git a/ChangeLog b/ChangeLog index 78377dcb4..45d6f244d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version <unreleased> + +Extractors +* [afreecatv] Fix extraction (#12179) + + version 2017.03.24 Extractors diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index e0a0f7c57..b774d6db8 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -4,15 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlparse, - compat_urlparse, -) +from ..compat import compat_xpath from ..utils import ( ExtractorError, int_or_none, - update_url_query, - xpath_element, xpath_text, ) @@ -43,7 +38,8 @@ class AfreecaTVIE(InfoExtractor): 'uploader': 'dailyapril', 'uploader_id': 'dailyapril', 'upload_date': '20160503', - } + }, + 'skip': 'Video is gone', }, { 'url': 
'http://afbbs.afreecatv.com:8080/app/read_ucc_bbs.cgi?nStationNo=16711924&nTitleNo=36153164&szBjId=dailyapril&nBbsNo=18605867', 'info_dict': { @@ -71,6 +67,19 @@ class AfreecaTVIE(InfoExtractor): 'upload_date': '20160502', }, }], + 'skip': 'Video is gone', + }, { + 'url': 'http://vod.afreecatv.com/PLAYER/STATION/18650793', + 'info_dict': { + 'id': '18650793', + 'ext': 'flv', + 'uploader': '윈아디', + 'uploader_id': 'badkids', + 'title': '오늘은 다르다! 쏘님의 우월한 위아래~ 댄스리액션!', + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, }, { 'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652', 'only_matching': True, @@ -90,40 +99,33 @@ class AfreecaTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - parsed_url = compat_urllib_parse_urlparse(url) - info_url = compat_urlparse.urlunparse(parsed_url._replace( - netloc='afbbs.afreecatv.com:8080', - path='/api/video/get_video_info.php')) video_xml = self._download_xml( - update_url_query(info_url, {'nTitleNo': video_id}), video_id) + 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php', + video_id, query={'nTitleNo': video_id}) - if xpath_element(video_xml, './track/video/file') is None: + video_element = video_xml.findall(compat_xpath('./track/video'))[1] + if video_element is None or video_element.text is None: raise ExtractorError('Specified AfreecaTV video does not exist', expected=True) - title = xpath_text(video_xml, './track/title', 'title') + video_url_raw = video_element.text + + app, playpath = video_url_raw.split('mp4:') + + title = xpath_text(video_xml, './track/title', 'title', fatal=True) uploader = xpath_text(video_xml, './track/nickname', 'uploader') uploader_id = xpath_text(video_xml, './track/bj_id', 'uploader id') duration = int_or_none(xpath_text(video_xml, './track/duration', 'duration')) thumbnail = xpath_text(video_xml, './track/titleImage', 'thumbnail') - entries = [] - for i, 
video_file in enumerate(video_xml.findall('./track/video/file')): - video_key = self.parse_video_key(video_file.get('key', '')) - if not video_key: - continue - entries.append({ - 'id': '%s_%s' % (video_id, video_key.get('part', i + 1)), - 'title': title, - 'upload_date': video_key.get('upload_date'), - 'duration': int_or_none(video_file.get('duration')), - 'url': video_file.text, - }) - - info = { + return { 'id': video_id, + 'url': app, + 'ext': 'flv', + 'play_path': 'mp4:' + playpath, + 'rtmp_live': True, # downloading won't end without this 'title': title, 'uploader': uploader, 'uploader_id': uploader_id, @@ -131,20 +133,6 @@ class AfreecaTVIE(InfoExtractor): 'thumbnail': thumbnail, } - if len(entries) > 1: - info['_type'] = 'multi_video' - info['entries'] = entries - elif len(entries) == 1: - info['url'] = entries[0]['url'] - info['upload_date'] = entries[0].get('upload_date') - else: - raise ExtractorError( - 'No files found for the specified AfreecaTV video, either' - ' the URL is incorrect or the video has been made private.', - expected=True) - - return info - class AfreecaTVGlobalIE(AfreecaTVIE): IE_NAME = 'afreecatv:global' From 03486dbb0133e42074c272f60e24f18c856fdf0d Mon Sep 17 00:00:00 2001 From: Random User <rndusr@posteo.de> Date: Sat, 25 Mar 2017 19:37:45 +0100 Subject: [PATCH 063/200] Add test for JWPlayer where config is passed as variable --- youtube_dl/extractor/generic.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 9868ca6d0..c8c103ae3 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -972,6 +972,20 @@ class GenericIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', }, }, + { + # JWPlayer config passed as variable + 'url': 'http://www.txxx.com/videos/3326530/ariele/', + 'info_dict': { + 'id': '3326530_hq', + 'ext': 'mp4', + 'title': 'ARIELE | Tube Cup', + 'uploader': 'www.txxx.com', + 'age_limit': 18, + }, + 
'params': { + 'skip_download': True, + } + }, # rtl.nl embed { 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen', From fb4fc44928d042a33287fd3e8e18b721c29ff8e8 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 25 Mar 2017 19:37:54 +0100 Subject: [PATCH 064/200] [downloader/hls] immediately delegate downloading to ffmpeg in case live stream --- youtube_dl/downloader/hls.py | 21 +++++++++++++-------- youtube_dl/extractor/arkena.py | 3 +-- youtube_dl/extractor/ceskatelevize.py | 3 +-- youtube_dl/extractor/eyedotv.py | 2 +- youtube_dl/extractor/freshlive.py | 5 ++--- youtube_dl/extractor/livestream.py | 15 ++++++++------- youtube_dl/extractor/vk.py | 3 +-- 7 files changed, 27 insertions(+), 25 deletions(-) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 4989abce1..7534e4da5 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -30,6 +30,15 @@ class HlsFD(FragmentFD): FD_NAME = 'hlsnative' + def _delegate_to_ffmpeg(self, filename, info_dict): + self.report_warning( + 'hlsnative has detected features it does not support, ' + 'extraction will be delegated to ffmpeg') + fd = FFmpegFD(self.ydl, self.params) + for ph in self._progress_hooks: + fd.add_progress_hook(ph) + return fd.real_download(filename, info_dict) + @staticmethod def can_download(manifest, info_dict): UNSUPPORTED_FEATURES = ( @@ -53,10 +62,12 @@ class HlsFD(FragmentFD): ) check_results = [not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES] check_results.append(can_decrypt_frag or '#EXT-X-KEY:METHOD=AES-128' not in manifest) - check_results.append(not info_dict.get('is_live')) return all(check_results) def real_download(self, filename, info_dict): + if info_dict.get('is_live'): + return self._delegate_to_ffmpeg(filename, info_dict) + man_url = info_dict['url'] self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) @@ -68,13 +79,7 @@ class HlsFD(FragmentFD): if 
info_dict.get('extra_param_to_segment_url'): self.report_error('pycrypto not found. Please install it.') return False - self.report_warning( - 'hlsnative has detected features it does not support, ' - 'extraction will be delegated to ffmpeg') - fd = FFmpegFD(self.ydl, self.params) - for ph in self._progress_hooks: - fd.add_progress_hook(ph) - return fd.real_download(filename, info_dict) + return self._delegate_to_ffmpeg(filename, info_dict) total_frags = 0 for line in s.splitlines(): diff --git a/youtube_dl/extractor/arkena.py b/youtube_dl/extractor/arkena.py index 50ffb442d..4495ddbb0 100644 --- a/youtube_dl/extractor/arkena.py +++ b/youtube_dl/extractor/arkena.py @@ -93,8 +93,7 @@ class ArkenaIE(InfoExtractor): exts = (mimetype2ext(f.get('Type')), determine_ext(f_url, None)) if kind == 'm3u8' or 'm3u8' in exts: formats.extend(self._extract_m3u8_formats( - f_url, video_id, 'mp4', - entry_protocol='m3u8' if is_live else 'm3u8_native', + f_url, video_id, 'mp4', 'm3u8_native', m3u8_id=kind, fatal=False, live=is_live)) elif kind == 'flash' or 'f4m' in exts: formats.extend(self._extract_f4m_formats( diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index b1dfacf80..dd2529a6d 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -160,8 +160,7 @@ class CeskaTelevizeIE(InfoExtractor): for format_id, stream_url in item.get('streamUrls', {}).items(): if 'playerType=flash' in stream_url: stream_formats = self._extract_m3u8_formats( - stream_url, playlist_id, 'mp4', - entry_protocol='m3u8' if is_live else 'm3u8_native', + stream_url, playlist_id, 'mp4', 'm3u8_native', m3u8_id='hls-%s' % format_id, fatal=False) else: stream_formats = self._extract_mpd_formats( diff --git a/youtube_dl/extractor/eyedotv.py b/youtube_dl/extractor/eyedotv.py index 2f3035147..f62ddebae 100644 --- a/youtube_dl/extractor/eyedotv.py +++ b/youtube_dl/extractor/eyedotv.py @@ -54,7 +54,7 @@ class 
EyedoTVIE(InfoExtractor): 'id': video_id, 'title': title, 'formats': self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8' if is_live else 'm3u8_native'), + m3u8_url, video_id, 'mp4', 'm3u8_native'), 'description': xpath_text(video_data, _add_ns('Description')), 'duration': parse_duration(xpath_text(video_data, _add_ns('Duration'))), 'uploader': xpath_text(video_data, _add_ns('Createur')), diff --git a/youtube_dl/extractor/freshlive.py b/youtube_dl/extractor/freshlive.py index a90f9156c..72a845945 100644 --- a/youtube_dl/extractor/freshlive.py +++ b/youtube_dl/extractor/freshlive.py @@ -56,9 +56,8 @@ class FreshLiveIE(InfoExtractor): is_live = info.get('liveStreamUrl') is not None formats = self._extract_m3u8_formats( - stream_url, video_id, ext='mp4', - entry_protocol='m3u8' if is_live else 'm3u8_native', - m3u8_id='hls') + stream_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls') if is_live: title = self._live_title(title) diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index c863413bf..7f946c6ed 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -119,7 +119,8 @@ class LivestreamIE(InfoExtractor): m3u8_url = video_data.get('m3u8_url') if m3u8_url: formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) f4m_url = video_data.get('f4m_url') if f4m_url: @@ -158,11 +159,11 @@ class LivestreamIE(InfoExtractor): if smil_url: formats.extend(self._extract_smil_formats(smil_url, broadcast_id)) - entry_protocol = 'm3u8' if is_live else 'm3u8_native' m3u8_url = stream_info.get('m3u8_url') if m3u8_url: formats.extend(self._extract_m3u8_formats( - m3u8_url, broadcast_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=False)) + m3u8_url, broadcast_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) rtsp_url = stream_info.get('rtsp_url') if rtsp_url: @@ 
-276,7 +277,7 @@ class LivestreamOriginalIE(InfoExtractor): 'view_count': view_count, } - def _extract_video_formats(self, video_data, video_id, entry_protocol): + def _extract_video_formats(self, video_data, video_id): formats = [] progressive_url = video_data.get('progressiveUrl') @@ -289,7 +290,8 @@ class LivestreamOriginalIE(InfoExtractor): m3u8_url = video_data.get('httpUrl') if m3u8_url: formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=False)) + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) rtsp_url = video_data.get('rtspUrl') if rtsp_url: @@ -340,11 +342,10 @@ class LivestreamOriginalIE(InfoExtractor): } video_data = self._download_json(stream_url, content_id) is_live = video_data.get('isLive') - entry_protocol = 'm3u8' if is_live else 'm3u8_native' info.update({ 'id': content_id, 'title': self._live_title(info['title']) if is_live else info['title'], - 'formats': self._extract_video_formats(video_data, content_id, entry_protocol), + 'formats': self._extract_video_formats(video_data, content_id), 'is_live': is_live, }) return info diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 7c42a4f54..dc2719cf9 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -432,8 +432,7 @@ class VKIE(VKBaseIE): }) elif format_id == 'hls': formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8' if is_live else 'm3u8_native', + format_url, video_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False, live=is_live)) elif format_id == 'rtmp': formats.append({ From c73e330e7adc9c0c15ac51aeea8fbb7dad95351a Mon Sep 17 00:00:00 2001 From: Random User <rndusr@posteo.de> Date: Sat, 25 Mar 2017 19:38:30 +0100 Subject: [PATCH 065/200] _find_jwplayer_data() returns dict or None This simplifies code for callers of `_find_jwplayer_data()` which no longer have to run `_parse_json()` on the return value. 
It also makes sure that `_find_jwplayer_data()` returns either a `dict` or `None` and nothing else. --- youtube_dl/extractor/common.py | 18 ++++++++++++------ youtube_dl/extractor/generic.py | 12 ++++-------- youtube_dl/extractor/tvnoe.py | 5 ++--- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index eb3c091aa..c2ca73ee1 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2161,18 +2161,24 @@ class InfoExtractor(object): }) return formats - @staticmethod - def _find_jwplayer_data(webpage): + def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json): mobj = re.search( r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)', webpage) if mobj: - return mobj.group('options') + try: + jwplayer_data = self._parse_json(mobj.group('options'), + video_id=video_id, + transform_source=transform_source) + except ExtractorError: + pass + else: + if isinstance(jwplayer_data, dict): + return jwplayer_data def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs): - jwplayer_data = self._parse_json( - self._find_jwplayer_data(webpage), video_id, - transform_source=js_to_json) + jwplayer_data = self._find_jwplayer_data( + webpage, video_id, transform_source=js_to_json) return self._parse_jwplayer_data( jwplayer_data, video_id, *args, **kwargs) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c8c103ae3..3fe0237b6 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2518,14 +2518,10 @@ class GenericIE(InfoExtractor): self._sort_formats(entry['formats']) return self.playlist_result(entries) - jwplayer_data_str = self._find_jwplayer_data(webpage) - if jwplayer_data_str: - try: - jwplayer_data = self._parse_json( - jwplayer_data_str, video_id, transform_source=js_to_json) - return self._parse_jwplayer_data(jwplayer_data, video_id) - except 
ExtractorError: - pass + jwplayer_data = self._find_jwplayer_data( + webpage, video_id, transform_source=js_to_json) + if jwplayer_data: + return self._parse_jwplayer_data(jwplayer_data, video_id) def check_video(vurl): if YoutubeIE.suitable(vurl): diff --git a/youtube_dl/extractor/tvnoe.py b/youtube_dl/extractor/tvnoe.py index 1a5b76bf2..26a5aeae4 100644 --- a/youtube_dl/extractor/tvnoe.py +++ b/youtube_dl/extractor/tvnoe.py @@ -31,9 +31,8 @@ class TVNoeIE(InfoExtractor): r'<iframe[^>]+src="([^"]+)"', webpage, 'iframe URL') ifs_page = self._download_webpage(iframe_url, video_id) - jwplayer_data = self._parse_json( - self._find_jwplayer_data(ifs_page), - video_id, transform_source=js_to_json) + jwplayer_data = self._find_jwplayer_data( + ifs_page, video_id, transform_source=js_to_json) info_dict = self._parse_jwplayer_data( jwplayer_data, video_id, require_title=False, base_url=iframe_url) From 51098426b83a8ebce4b0c08e869ce023232089fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 26 Mar 2017 02:30:10 +0700 Subject: [PATCH 066/200] [utils] Introduce expand_path --- test/test_utils.py | 10 ++++++++++ youtube_dl/utils.py | 6 ++++++ 2 files changed, 16 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 173c49514..8c50b46e8 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -56,6 +56,7 @@ from youtube_dl.utils import ( read_batch_urls, sanitize_filename, sanitize_path, + expand_path, prepend_extension, replace_extension, remove_start, @@ -95,6 +96,8 @@ from youtube_dl.utils import ( from youtube_dl.compat import ( compat_chr, compat_etree_fromstring, + compat_getenv, + compat_setenv, compat_urlparse, compat_parse_qs, ) @@ -214,6 +217,13 @@ class TestUtil(unittest.TestCase): self.assertEqual(sanitize_path('./abc'), 'abc') self.assertEqual(sanitize_path('./../abc'), '..\\abc') + def test_expand_path(self): + compat_setenv('YOUTUBE-DL-EXPATH-PATH', 'expanded') + 
self.assertEqual(expand_path('%YOUTUBE-DL-EXPATH-PATH%'), 'expanded') + self.assertEqual(expand_path('%HOMEPATH%'), compat_getenv('HOMEPATH')) + self.assertEqual(expand_path('~'), compat_getenv('HOME')) + self.assertEqual(expand_path('~/%YOUTUBE-DL-EXPATH-PATH%'), '%s/expanded' % compat_getenv('HOME')) + def test_prepend_extension(self): self.assertEqual(prepend_extension('abc.ext', 'temp'), 'abc.temp.ext') self.assertEqual(prepend_extension('abc.ext', 'temp', 'ext'), 'abc.temp.ext') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d293c7498..2340bc306 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -39,6 +39,7 @@ from .compat import ( compat_basestring, compat_chr, compat_etree_fromstring, + compat_expanduser, compat_html_entities, compat_html_entities_html5, compat_http_client, @@ -539,6 +540,11 @@ def sanitized_Request(url, *args, **kwargs): return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs) +def expand_path(s): + """Expand shell variables and ~""" + return os.path.expandvars(compat_expanduser(s)) + + def orderedSet(iterable): """ Remove all duplicates from the input iterable """ res = [] From 590bc6f6a1cb513852a22f6db0ee36e9bd138f64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 26 Mar 2017 02:31:16 +0700 Subject: [PATCH 067/200] Use expand_path where appropriate (closes #12556) --- youtube_dl/YoutubeDL.py | 6 +++--- youtube_dl/__init__.py | 8 ++++---- youtube_dl/cache.py | 9 ++++++--- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index cb502c26f..21586f0f4 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -29,7 +29,6 @@ import random from .compat import ( compat_basestring, compat_cookiejar, - compat_expanduser, compat_get_terminal_size, compat_http_client, compat_kwargs, @@ -54,6 +53,7 @@ from .utils import ( encode_compat_str, encodeFilename, error_to_compat_str, + expand_path, 
ExtractorError, format_bytes, formatSeconds, @@ -672,7 +672,7 @@ class YoutubeDL(object): FORMAT_RE.format(numeric_field), r'%({0})s'.format(numeric_field), outtmpl) - tmpl = compat_expanduser(outtmpl) + tmpl = expand_path(outtmpl) filename = tmpl % template_dict # Temporary fix for #4787 # 'Treat' all problem characters by passing filename through preferredencoding @@ -2170,7 +2170,7 @@ class YoutubeDL(object): if opts_cookiefile is None: self.cookiejar = compat_cookiejar.CookieJar() else: - opts_cookiefile = compat_expanduser(opts_cookiefile) + opts_cookiefile = expand_path(opts_cookiefile) self.cookiejar = compat_cookiejar.MozillaCookieJar( opts_cookiefile) if os.access(opts_cookiefile, os.R_OK): diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 2f640607f..f15606568 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -16,7 +16,6 @@ from .options import ( parseOpts, ) from .compat import ( - compat_expanduser, compat_getpass, compat_shlex_split, workaround_optparse_bug9161, @@ -26,6 +25,7 @@ from .utils import ( decodeOption, DEFAULT_OUTTMPL, DownloadError, + expand_path, match_filter_func, MaxDownloadsReached, preferredencoding, @@ -88,7 +88,7 @@ def _real_main(argv=None): batchfd = sys.stdin else: batchfd = io.open( - compat_expanduser(opts.batchfile), + expand_path(opts.batchfile), 'r', encoding='utf-8', errors='ignore') batch_urls = read_batch_urls(batchfd) if opts.verbose: @@ -238,7 +238,7 @@ def _real_main(argv=None): any_getting = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json any_printing = opts.print_json - download_archive_fn = compat_expanduser(opts.download_archive) if opts.download_archive is not None else opts.download_archive + download_archive_fn = expand_path(opts.download_archive) if opts.download_archive is not None else opts.download_archive # PostProcessors postprocessors 
= [] @@ -449,7 +449,7 @@ def _real_main(argv=None): try: if opts.load_info_filename is not None: - retcode = ydl.download_with_info_file(compat_expanduser(opts.load_info_filename)) + retcode = ydl.download_with_info_file(expand_path(opts.load_info_filename)) else: retcode = ydl.download(all_urls) except MaxDownloadsReached: diff --git a/youtube_dl/cache.py b/youtube_dl/cache.py index 5fe839eb1..7bdade1bd 100644 --- a/youtube_dl/cache.py +++ b/youtube_dl/cache.py @@ -8,8 +8,11 @@ import re import shutil import traceback -from .compat import compat_expanduser, compat_getenv -from .utils import write_json_file +from .compat import compat_getenv +from .utils import ( + expand_path, + write_json_file, +) class Cache(object): @@ -21,7 +24,7 @@ class Cache(object): if res is None: cache_root = compat_getenv('XDG_CACHE_HOME', '~/.cache') res = os.path.join(cache_root, 'youtube-dl') - return compat_expanduser(res) + return expand_path(res) def _get_cache_fn(self, section, key, dtype): assert re.match(r'^[a-zA-Z0-9_.-]+$', section), \ From 5b7cc56b05ff4e3936da7a7c0bec5f8d5c9f27c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 26 Mar 2017 02:32:14 +0700 Subject: [PATCH 068/200] [atresplayer] PEP 8 --- youtube_dl/extractor/atresplayer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index bfda1e24e..99af6dc5a 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -156,8 +156,8 @@ class AtresPlayerIE(InfoExtractor): if format_id == 'token' or not video_url.startswith('http'): continue if 'geodeswowsmpra3player' in video_url: - f4m_path = video_url.split('smil:', 1)[-1].split('free_', 1)[0] - f4m_url = 'http://drg.antena3.com/{0}hds/es/sd.f4m'.format(f4m_path) + # f4m_path = video_url.split('smil:', 1)[-1].split('free_', 1)[0] + # f4m_url = 'http://drg.antena3.com/{0}hds/es/sd.f4m'.format(f4m_path) 
# this videos are protected by DRM, the f4m downloader doesn't support them continue video_url_hd = video_url.replace('free_es', 'es') From 15495cf3e5f8dbab5559936006df77e3ac0a370b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 26 Mar 2017 02:32:46 +0700 Subject: [PATCH 069/200] [franceculture] PEP 8 --- youtube_dl/extractor/franceculture.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py index df3d757f3..b8fa17588 100644 --- a/youtube_dl/extractor/franceculture.py +++ b/youtube_dl/extractor/franceculture.py @@ -6,7 +6,6 @@ from ..utils import ( determine_ext, extract_attributes, int_or_none, - unified_strdate, ) From d212c93d16bbb6aeb8645f1dbb15a78f3d9414a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 26 Mar 2017 02:34:25 +0700 Subject: [PATCH 070/200] [pluralsight] PEP 8 --- youtube_dl/extractor/pluralsight.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index 0c6e036ca..e45d9fe55 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -169,11 +169,10 @@ class PluralsightIE(PluralsightBaseIE): collection = course['modules'] - module, clip = None, None + clip = None for module_ in collection: if name in (module_.get('moduleName'), module_.get('name')): - module = module_ for clip_ in module_.get('clips', []): clip_index = clip_.get('clipIndex') if clip_index is None: From 41c5e60dd57c0df10f4aa05dee95af2bbc1dc8dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 26 Mar 2017 03:07:56 +0700 Subject: [PATCH 071/200] [test_utils] Fix expand_path tests --- test/test_utils.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 8c50b46e8..b9a02666d 100644 
--- a/test/test_utils.py +++ b/test/test_utils.py @@ -218,11 +218,16 @@ class TestUtil(unittest.TestCase): self.assertEqual(sanitize_path('./../abc'), '..\\abc') def test_expand_path(self): + def env(var): + return '%{0}%'.format(var) if sys.platform == 'win32' else '${0}'.format(var) + compat_setenv('YOUTUBE-DL-EXPATH-PATH', 'expanded') - self.assertEqual(expand_path('%YOUTUBE-DL-EXPATH-PATH%'), 'expanded') - self.assertEqual(expand_path('%HOMEPATH%'), compat_getenv('HOMEPATH')) + self.assertEqual(expand_path(env('YOUTUBE-DL-EXPATH-PATH')), 'expanded') + self.assertEqual(expand_path(env('HOMEPATH')), compat_getenv('HOMEPATH')) self.assertEqual(expand_path('~'), compat_getenv('HOME')) - self.assertEqual(expand_path('~/%YOUTUBE-DL-EXPATH-PATH%'), '%s/expanded' % compat_getenv('HOME')) + self.assertEqual( + expand_path('~/%s' % env('YOUTUBE-DL-EXPATH-PATH')), + '%s/expanded' % compat_getenv('HOME')) def test_prepend_extension(self): self.assertEqual(prepend_extension('abc.ext', 'temp'), 'abc.temp.ext') From a426ef6d783038e570db252a2e9e72800ffcb381 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 26 Mar 2017 03:22:48 +0700 Subject: [PATCH 072/200] [test_utils] Do not use dash in env variables' names --- test/test_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index b9a02666d..aa4569b81 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -221,12 +221,12 @@ class TestUtil(unittest.TestCase): def env(var): return '%{0}%'.format(var) if sys.platform == 'win32' else '${0}'.format(var) - compat_setenv('YOUTUBE-DL-EXPATH-PATH', 'expanded') - self.assertEqual(expand_path(env('YOUTUBE-DL-EXPATH-PATH')), 'expanded') - self.assertEqual(expand_path(env('HOMEPATH')), compat_getenv('HOMEPATH')) + compat_setenv('YOUTUBE_DL_EXPATH_PATH', 'expanded') + self.assertEqual(expand_path(env('YOUTUBE_DL_EXPATH_PATH')), 'expanded') + 
self.assertEqual(expand_path(env('HOME')), compat_getenv('HOME')) self.assertEqual(expand_path('~'), compat_getenv('HOME')) self.assertEqual( - expand_path('~/%s' % env('YOUTUBE-DL-EXPATH-PATH')), + expand_path('~/%s' % env('YOUTUBE_DL_EXPATH_PATH')), '%s/expanded' % compat_getenv('HOME')) def test_prepend_extension(self): From 942b44a0525f677924c660bcb00902d705d91fc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 26 Mar 2017 03:24:25 +0700 Subject: [PATCH 073/200] [test_compat] Do not use dash in env variables' names --- test/test_compat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_compat.py b/test/test_compat.py index b57424948..d6c54e135 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -27,11 +27,11 @@ from youtube_dl.compat import ( class TestCompat(unittest.TestCase): def test_compat_getenv(self): test_str = 'тест' - compat_setenv('YOUTUBE-DL-TEST', test_str) - self.assertEqual(compat_getenv('YOUTUBE-DL-TEST'), test_str) + compat_setenv('YOUTUBE_DL_COMPAT_GETENV', test_str) + self.assertEqual(compat_getenv('YOUTUBE_DL_COMPAT_GETENV'), test_str) def test_compat_setenv(self): - test_var = 'YOUTUBE-DL-TEST' + test_var = 'YOUTUBE_DL_COMPAT_SETENV' test_str = 'тест' compat_setenv(test_var, test_str) compat_getenv(test_var) From 2bfaf89b6cc6dd07ed6ca32086c72a98b67c20ba Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 25 Mar 2017 23:06:33 +0100 Subject: [PATCH 074/200] [downloader/hls] move check for m3u8 live streams to get_suitable_downloader --- youtube_dl/downloader/__init__.py | 3 +++ youtube_dl/downloader/hls.py | 21 ++++++++------------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index 16952e359..2e485df9d 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -43,6 +43,9 @@ def 
get_suitable_downloader(info_dict, params={}): if ed.can_download(info_dict): return ed + if protocol.startswith('m3u8') and info_dict.get('is_live'): + return FFmpegFD + if protocol == 'm3u8' and params.get('hls_prefer_native') is True: return HlsFD diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 7534e4da5..4989abce1 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -30,15 +30,6 @@ class HlsFD(FragmentFD): FD_NAME = 'hlsnative' - def _delegate_to_ffmpeg(self, filename, info_dict): - self.report_warning( - 'hlsnative has detected features it does not support, ' - 'extraction will be delegated to ffmpeg') - fd = FFmpegFD(self.ydl, self.params) - for ph in self._progress_hooks: - fd.add_progress_hook(ph) - return fd.real_download(filename, info_dict) - @staticmethod def can_download(manifest, info_dict): UNSUPPORTED_FEATURES = ( @@ -62,12 +53,10 @@ class HlsFD(FragmentFD): ) check_results = [not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES] check_results.append(can_decrypt_frag or '#EXT-X-KEY:METHOD=AES-128' not in manifest) + check_results.append(not info_dict.get('is_live')) return all(check_results) def real_download(self, filename, info_dict): - if info_dict.get('is_live'): - return self._delegate_to_ffmpeg(filename, info_dict) - man_url = info_dict['url'] self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) @@ -79,7 +68,13 @@ class HlsFD(FragmentFD): if info_dict.get('extra_param_to_segment_url'): self.report_error('pycrypto not found. 
Please install it.') return False - return self._delegate_to_ffmpeg(filename, info_dict) + self.report_warning( + 'hlsnative has detected features it does not support, ' + 'extraction will be delegated to ffmpeg') + fd = FFmpegFD(self.ydl, self.params) + for ph in self._progress_hooks: + fd.add_progress_hook(ph) + return fd.real_download(filename, info_dict) total_frags = 0 for line in s.splitlines(): From f7923a4c399e0ce8e6cd230db92aefbfcff297c3 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 26 Mar 2017 22:07:12 +0800 Subject: [PATCH 075/200] [ChangeLog] Update after #12307 --- ChangeLog | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ChangeLog b/ChangeLog index 45d6f244d..adc64053b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,10 @@ version <unreleased> +Core +* Don't raise an error if JWPlayer config data is not a Javascript object + literal. _find_jwplayer_data() now returns a dict rather than an str. + (#12307) + Extractors * [afreecatv] Fix extraction (#12179) From 82eefd0be00b7557782ae75602b463e226dd964f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 26 Mar 2017 23:39:12 +0700 Subject: [PATCH 076/200] [ChangeLog] Actualize --- ChangeLog | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index adc64053b..e79067cff 100644 --- a/ChangeLog +++ b/ChangeLog @@ -2,11 +2,21 @@ version <unreleased> Core * Don't raise an error if JWPlayer config data is not a Javascript object - literal. _find_jwplayer_data() now returns a dict rather than an str. - (#12307) + literal. _find_jwplayer_data now returns a dict rather than an str. 
(#12307) +* Expand environment variables for options representing paths (#12556) ++ [utils] Introduce expand_path +* [downloader/hls] Delegate downloading to ffmpeg immediately for live streams Extractors * [afreecatv] Fix extraction (#12179) ++ [atvat] Add support for atv.at (#5325) ++ [fox] Add metadata extraction (#12391) ++ [atresplayer] Extract DASH formats ++ [atresplayer] Extract HD manifest (#12548) +* [atresplayer] Fix login error detection (#12548) +* [franceculture] Fix extraction (#12547) +* [youtube] Improve URL regular expression (#12538) +* [generic] Do not follow redirects to the same URL version 2017.03.24 From 9e691da06791a0a617ed69ef21e272536e247ed1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 26 Mar 2017 08:11:40 +0700 Subject: [PATCH 077/200] release 2017.03.26 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index dfff41d2d..2f717926c 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.03.24*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.03.24** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.03.26*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.03.26** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.03.24 +[debug] youtube-dl version 2017.03.26 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index e79067cff..07725b12a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2017.03.26 Core * Don't raise an error if JWPlayer config data is not a Javascript object diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 7c99ba3c2..e9dbc021b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -67,6 +67,7 @@ - **arte.tv:playlist** - **AtresPlayer** - **ATTTechChannel** + - **ATVAt** - **AudiMedia** - **AudioBoom** - **audiomack** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 13904c724..94e8198ec 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.03.24' +__version__ = '2017.03.26' From aea1dccbd07b073ef36b325a9a21eb3f642322d9 Mon Sep 17 00:00:00 2001 From: Tithen-Firion <tithen.firion.0@gmail.com> Date: Tue, 28 Mar 2017 15:42:03 +0200 Subject: [PATCH 078/200] [openload] fix extractor --- youtube_dl/extractor/openload.py | 73 +++++++++++++------------------- 1 file changed, 30 insertions(+), 43 deletions(-) diff --git 
a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 58ffde541..d8036b54a 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -75,51 +75,38 @@ class OpenloadIE(InfoExtractor): '<span[^>]+id="[^"]+"[^>]*>([0-9A-Za-z]+)</span>', webpage, 'openload ID') - video_url_chars = [] - - first_char = ord(ol_id[0]) - key = first_char - 55 - maxKey = max(2, key) - key = min(maxKey, len(ol_id) - 38) - t = ol_id[key:key + 36] - - hashMap = {} - v = ol_id.replace(t, '') - h = 0 - - while h < len(t): - f = t[h:h + 3] - i = int(f, 8) - hashMap[h / 3] = i - h += 3 - - h = 0 - H = 0 - while h < len(v): - B = '' - C = '' - if len(v) >= h + 2: - B = v[h:h + 2] - if len(v) >= h + 3: - C = v[h:h + 3] - i = int(B, 16) - h += 2 - if H % 3 == 0: - i = int(C, 8) - h += 1 - elif H % 2 == 0 and H != 0 and ord(v[H - 1]) < 60: - i = int(C, 10) - h += 1 - index = H % 7 - - A = hashMap[index] - i ^= 213 - i ^= A - video_url_chars.append(compat_chr(i)) - H += 1 + decoded = '' + a = ol_id[0:24] + b = [] + for i in range(0, len(a), 8): + b.append(int(a[i:i + 8] or '0', 16)) + ol_id = ol_id[24:] + j = 0 + k = 0 + while j < len(ol_id): + c = 128 + d = 0 + e = 0 + f = 0 + _more = True + while _more: + if j + 1 >= len(ol_id): + c = 143 + f = int(ol_id[j:j + 2] or '0', 16) + j += 2 + d += (f & 127) << e + e += 7 + _more = f >= c + g = d ^ b[k % 3] + for i in range(4): + char_dec = (g >> 8 * i) & (c + 127) + char = compat_chr(char_dec) + if char != '#': + decoded += char + k += 1 video_url = 'https://openload.co/stream/%s?mime=true' - video_url = video_url % (''.join(video_url_chars)) + video_url = video_url % decoded title = self._og_search_title(webpage, default=None) or self._search_regex( r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage, From 12ee65ea0d09c6ac42ad06b3d561b4a26db00cfe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 28 Mar 2017 23:35:48 +0700 Subject: [PATCH 079/200] [options] 
Mention ISM for --fragment-retries and --skip-unavailable-fragments --- youtube_dl/options.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 6b811535f..2d2f5e47b 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -459,11 +459,11 @@ def parseOpts(overrideArguments=None): downloader.add_option( '--fragment-retries', dest='fragment_retries', metavar='RETRIES', default=10, - help='Number of retries for a fragment (default is %default), or "infinite" (DASH and hlsnative only)') + help='Number of retries for a fragment (default is %default), or "infinite" (DASH, hlsnative and ISM)') downloader.add_option( '--skip-unavailable-fragments', action='store_true', dest='skip_unavailable_fragments', default=True, - help='Skip unavailable fragments (DASH and hlsnative only)') + help='Skip unavailable fragments (DASH, hlsnative and ISM)') downloader.add_option( '--abort-on-unavailable-fragment', action='store_false', dest='skip_unavailable_fragments', From 128244657b92582f7f4793c2d1be86b04032ac7f Mon Sep 17 00:00:00 2001 From: plroman <dev@plr.pm> Date: Tue, 28 Mar 2017 23:23:20 +0200 Subject: [PATCH 080/200] [allocine] Fix extraction --- youtube_dl/extractor/allocine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/allocine.py b/youtube_dl/extractor/allocine.py index 90f11d39f..0463a070b 100644 --- a/youtube_dl/extractor/allocine.py +++ b/youtube_dl/extractor/allocine.py @@ -70,7 +70,7 @@ class AllocineIE(InfoExtractor): if model: model_data = self._parse_json(model, display_id) - for video_url in model_data['sources'].values(): + for video_url in model_data['videos'][0]['sources'].values(): video_id, format_id = url_basename(video_url).split('_')[:2] formats.append({ 'format_id': format_id, @@ -78,7 +78,7 @@ class AllocineIE(InfoExtractor): 'url': video_url, }) - title = model_data['title'] + title = model_data['videos'][0]['title'] else: 
video_id = display_id media_data = self._download_json( From 639e5b2a848c0a73e8525472dd8bb4b14a8c4746 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 29 Mar 2017 04:43:12 +0700 Subject: [PATCH 081/200] [allocine] Extract more metadata --- youtube_dl/extractor/allocine.py | 46 +++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/allocine.py b/youtube_dl/extractor/allocine.py index 0463a070b..cd533acfc 100644 --- a/youtube_dl/extractor/allocine.py +++ b/youtube_dl/extractor/allocine.py @@ -2,9 +2,13 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - remove_end, + int_or_none, qualities, + remove_end, + try_get, + unified_timestamp, url_basename, ) @@ -22,6 +26,10 @@ class AllocineIE(InfoExtractor): 'title': 'Astérix - Le Domaine des Dieux Teaser VF', 'description': 'md5:4a754271d9c6f16c72629a8a993ee884', 'thumbnail': r're:http://.*\.jpg', + 'duration': 39, + 'timestamp': 1404273600, + 'upload_date': '20140702', + 'view_count': int, }, }, { 'url': 'http://www.allocine.fr/video/player_gen_cmedia=19540403&cfilm=222257.html', @@ -33,6 +41,10 @@ class AllocineIE(InfoExtractor): 'title': 'Planes 2 Bande-annonce VF', 'description': 'Regardez la bande annonce du film Planes 2 (Planes 2 Bande-annonce VF). 
Planes 2, un film de Roberts Gannaway', 'thumbnail': r're:http://.*\.jpg', + 'duration': 69, + 'timestamp': 1385659800, + 'upload_date': '20131128', + 'view_count': int, }, }, { 'url': 'http://www.allocine.fr/video/player_gen_cmedia=19544709&cfilm=181290.html', @@ -44,6 +56,10 @@ class AllocineIE(InfoExtractor): 'title': 'Dragons 2 - Bande annonce finale VF', 'description': 'md5:6cdd2d7c2687d4c6aafe80a35e17267a', 'thumbnail': r're:http://.*\.jpg', + 'duration': 144, + 'timestamp': 1397589900, + 'upload_date': '20140415', + 'view_count': int, }, }, { 'url': 'http://www.allocine.fr/video/video-19550147/', @@ -69,34 +85,37 @@ class AllocineIE(InfoExtractor): r'data-model="([^"]+)"', webpage, 'data model', default=None) if model: model_data = self._parse_json(model, display_id) - - for video_url in model_data['videos'][0]['sources'].values(): + video = model_data['videos'][0] + title = video['title'] + for video_url in video['sources'].values(): video_id, format_id = url_basename(video_url).split('_')[:2] formats.append({ 'format_id': format_id, 'quality': quality(format_id), 'url': video_url, }) - - title = model_data['videos'][0]['title'] + duration = int_or_none(video.get('duration')) + view_count = int_or_none(video.get('view_count')) + timestamp = unified_timestamp(try_get( + video, lambda x: x['added_at']['date'], compat_str)) else: video_id = display_id media_data = self._download_json( 'http://www.allocine.fr/ws/AcVisiondataV5.ashx?media=%s' % video_id, display_id) + title = remove_end( + self._html_search_regex( + r'(?s)<title>(.+?)', webpage, 'title').strip(), + ' - AlloCiné') for key, value in media_data['video'].items(): if not key.endswith('Path'): continue - format_id = key[:-len('Path')] formats.append({ 'format_id': format_id, 'quality': quality(format_id), 'url': value, }) - - title = remove_end(self._html_search_regex( - r'(?s)(.+?)', webpage, 'title' - ).strip(), ' - AlloCiné') + duration, view_count, timestamp = [None] * 3 
self._sort_formats(formats) @@ -104,7 +123,10 @@ class AllocineIE(InfoExtractor): 'id': video_id, 'display_id': display_id, 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage), - 'formats': formats, 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'duration': duration, + 'timestamp': timestamp, + 'view_count': view_count, + 'formats': formats, } From 82be732b174ea8e9984e7b0582c69e41b266d1da Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 31 Mar 2017 12:24:23 +0100 Subject: [PATCH 082/200] [adn] Add new extractor --- youtube_dl/extractor/adn.py | 136 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 137 insertions(+) create mode 100644 youtube_dl/extractor/adn.py diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py new file mode 100644 index 000000000..e44caa00b --- /dev/null +++ b/youtube_dl/extractor/adn.py @@ -0,0 +1,136 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import json +import os + +from .common import InfoExtractor +from ..aes import aes_cbc_decrypt +from ..compat import compat_ord +from ..utils import ( + bytes_to_intlist, + ExtractorError, + float_or_none, + intlist_to_bytes, + srt_subtitles_timecode, + strip_or_none, +) + + +class ADNIE(InfoExtractor): + IE_DESC = 'Anime Digital Network' + _VALID_URL = r'https?://(?:www\.)?animedigitalnetwork\.fr/video/[^/]+/(?P\d+)' + _TEST = { + 'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites', + 'md5': 'e497370d847fd79d9d4c74be55575c7a', + 'info_dict': { + 'id': '7778', + 'ext': 'mp4', + 'title': 'Blue Exorcist - Kyôto Saga - Épisode 1', + 'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5', + } + } + + def _get_subtitles(self, sub_path, video_id): + if not sub_path: + return None + + enc_subtitles = self._download_webpage( + 'http://animedigitalnetwork.fr/' + sub_path, + video_id, 
fatal=False) + if not enc_subtitles: + return None + + # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js + dec_subtitles = intlist_to_bytes(aes_cbc_decrypt( + bytes_to_intlist(base64.b64decode(enc_subtitles[24:])), + bytes_to_intlist(b'\xb5@\xcfq\xa3\x98"N\xe4\xf3\x12\x98}}\x16\xd8'), + bytes_to_intlist(base64.b64decode(enc_subtitles[:24])) + )) + subtitles_json = self._parse_json( + dec_subtitles[:-compat_ord(dec_subtitles[-1])], + None, fatal=False) + if not subtitles_json: + return None + + subtitles = {} + for sub_lang, sub in subtitles_json.items(): + srt = '' + for num, current in enumerate(sub): + start, end, text = ( + float_or_none(current.get('startTime')), + float_or_none(current.get('endTime')), + current.get('text')) + if start is None or end is None or text is None: + continue + srt += os.linesep.join( + ( + '%d' % num, + '%s --> %s' % ( + srt_subtitles_timecode(start), + srt_subtitles_timecode(end)), + text, + os.linesep, + )) + + if sub_lang == 'vostf': + sub_lang = 'fr' + subtitles.setdefault(sub_lang, []).extend([{ + 'ext': 'json', + 'data': json.dumps(sub), + }, { + 'ext': 'srt', + 'data': srt, + }]) + return subtitles + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + player_config = self._parse_json(self._search_regex( + r'playerConfig\s*=\s*({.+});', webpage, 'player config'), video_id) + + video_info = {} + video_info_str = self._search_regex( + r'videoInfo\s*=\s*({.+});', webpage, + 'video info', fatal=False) + if video_info_str: + video_info = self._parse_json( + video_info_str, video_id, fatal=False) or {} + + options = player_config.get('options') or {} + metas = options.get('metas') or {} + title = metas.get('title') or video_info['title'] + links = player_config.get('links') or {} + + formats = [] + for format_id, qualities in links.items(): + for load_balancer_url in qualities.values(): + load_balancer_data = self._download_json( + 
load_balancer_url, video_id, fatal=False) or {} + m3u8_url = load_balancer_data.get('location') + if not m3u8_url: + continue + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False) + if format_id == 'vf': + for f in m3u8_formats: + f['language'] = 'fr' + formats.extend(m3u8_formats) + error = options.get('error') + if not formats and error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': strip_or_none(metas.get('summary') or video_info.get('resume')), + 'thumbnail': video_info.get('image'), + 'formats': formats, + 'subtitles': self.extract_subtitles(player_config.get('subtitles'), video_id), + 'episode': metas.get('subtitle') or video_info.get('videoTitle'), + 'series': video_info.get('playlistTitle'), + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6a7028a4d..43933ad5b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -19,6 +19,7 @@ from .acast import ( ACastChannelIE, ) from .addanime import AddAnimeIE +from .adn import ADNIE from .adobetv import ( AdobeTVIE, AdobeTVShowIE, From 3e943cfe09eda6ef9b0fa419fdd22155fbaa047f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 31 Mar 2017 14:54:06 +0100 Subject: [PATCH 083/200] [generic] pass base_url to _parse_jwplayer_data --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 274f81738..73911940c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2568,7 +2568,7 @@ class GenericIE(InfoExtractor): webpage, video_id, transform_source=js_to_json) if jwplayer_data: info = self._parse_jwplayer_data( - jwplayer_data, video_id, require_title=False) + jwplayer_data, video_id, 
require_title=False, base_url=url) if not info.get('title'): info['title'] = video_title return info From 1640eb096166c81918125a0a7462eb2edb063167 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 31 Mar 2017 23:57:35 +0700 Subject: [PATCH 084/200] [YoutubeDL] Return early when extraction of url_transparent fails --- youtube_dl/YoutubeDL.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 21586f0f4..54bc8b06d 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -837,6 +837,12 @@ class YoutubeDL(object): ie_result['url'], ie_key=ie_result.get('ie_key'), extra_info=extra_info, download=False, process=False) + # extract_info may return None when ignoreerrors is enabled and + # extraction failed with an error, don't crash and return early + # in this case + if not info: + return info + force_properties = dict( (k, v) for k, v in ie_result.items() if v is not None) for f in ('_type', 'url', 'ie_key'): From 7453999580f2809153a84420d3ca72b24186c02b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 Apr 2017 00:25:27 +0700 Subject: [PATCH 085/200] [packtpub] Add extractor (closes #12610) --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/packtpub.py | 138 +++++++++++++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100644 youtube_dl/extractor/packtpub.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 43933ad5b..6ad7444fe 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -729,6 +729,10 @@ from .orf import ( ORFFM4IE, ORFIPTVIE, ) +from .packtpub import ( + PacktPubIE, + PacktPubCourseIE, +) from .pandatv import PandaTVIE from .pandoratv import PandoraTVIE from .parliamentliveuk import ParliamentLiveUKIE diff --git a/youtube_dl/extractor/packtpub.py b/youtube_dl/extractor/packtpub.py new file mode 100644 index 000000000..881f3bcc7 
--- /dev/null +++ b/youtube_dl/extractor/packtpub.py @@ -0,0 +1,138 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + clean_html, + ExtractorError, + remove_end, + strip_or_none, + unified_timestamp, + urljoin, +) + + +class PacktPubBaseIE(InfoExtractor): + _PACKT_BASE = 'https://www.packtpub.com' + _MAPT_REST = '%s/mapt-rest' % _PACKT_BASE + + +class PacktPubIE(PacktPubBaseIE): + _VALID_URL = r'https?://(?:www\.)?packtpub\.com/mapt/video/[^/]+/(?P\d+)/(?P\d+)/(?P\d+)' + + _TEST = { + 'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215/20528/20530/Project+Intro', + 'md5': '1e74bd6cfd45d7d07666f4684ef58f70', + 'info_dict': { + 'id': '20530', + 'ext': 'mp4', + 'title': 'Project Intro', + 'thumbnail': r're:(?i)^https?://.*\.jpg', + 'timestamp': 1490918400, + 'upload_date': '20170331', + }, + } + + def _handle_error(self, response): + if response.get('status') != 'success': + raise ExtractorError( + '% said: %s' % (self.IE_NAME, response['message']), + expected=True) + + def _download_json(self, *args, **kwargs): + response = super(PacktPubIE, self)._download_json(*args, **kwargs) + self._handle_error(response) + return response + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + course_id, chapter_id, video_id = mobj.group( + 'course_id', 'chapter_id', 'id') + + video = self._download_json( + '%s/users/me/products/%s/chapters/%s/sections/%s' + % (self._MAPT_REST, course_id, chapter_id, video_id), video_id, + 'Downloading JSON video')['data'] + + content = video.get('content') + if not content: + raise ExtractorError('This video is locked', expected=True) + + video_url = content['file'] + + metadata = self._download_json( + '%s/products/%s/chapters/%s/sections/%s/metadata' + % (self._MAPT_REST, course_id, chapter_id, video_id), + video_id)['data'] + + title = metadata['pageTitle'] + course_title = metadata.get('title') + if 
course_title: + title = remove_end(title, ' - %s' % course_title) + timestamp = unified_timestamp(metadata.get('publicationDate')) + thumbnail = urljoin(self._PACKT_BASE, metadata.get('filepath')) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + } + + +class PacktPubCourseIE(PacktPubBaseIE): + _VALID_URL = r'(?Phttps?://(?:www\.)?packtpub\.com/mapt/video/[^/]+/(?P\d+))' + _TEST = { + 'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215', + 'info_dict': { + 'id': '9781787122215', + 'title': 'Learn Nodejs by building 12 projects [Video]', + }, + 'playlist_count': 90, + } + + @classmethod + def suitable(cls, url): + return False if PacktPubIE.suitable(url) else super( + PacktPubCourseIE, cls).suitable(url) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + url, course_id = mobj.group('url', 'id') + + course = self._download_json( + '%s/products/%s/metadata' % (self._MAPT_REST, course_id), + course_id)['data'] + + entries = [] + for chapter_num, chapter in enumerate(course['tableOfContents'], 1): + if chapter.get('type') != 'chapter': + continue + children = chapter.get('children') + if not isinstance(children, list): + continue + chapter_info = { + 'chapter': chapter.get('title'), + 'chapter_number': chapter_num, + 'chapter_id': chapter.get('id'), + } + for section in children: + if section.get('type') != 'section': + continue + section_url = section.get('seoUrl') + if not isinstance(section_url, compat_str): + continue + entry = { + '_type': 'url_transparent', + 'url': urljoin(url + '/', section_url), + 'title': strip_or_none(section.get('title')), + 'description': clean_html(section.get('summary')), + 'ie_key': PacktPubIE.ie_key(), + } + entry.update(chapter_info) + entries.append(entry) + + return self.playlist_result(entries, course_id, course.get('title')) From 77c8ebe6318055cc34eaedca63f4866c4c47437a Mon Sep 17 00:00:00 2001 From: Remita Amine 
Date: Fri, 31 Mar 2017 23:28:24 +0100 Subject: [PATCH 086/200] [vrv] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/vrv.py | 151 +++++++++++++++++++++++++++++ 2 files changed, 152 insertions(+) create mode 100644 youtube_dl/extractor/vrv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6ad7444fe..1b427e256 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1182,6 +1182,7 @@ from .voxmedia import VoxMediaIE from .vporn import VpornIE from .vrt import VRTIE from .vrak import VrakIE +from .vrv import VRVIE from .medialaan import MedialaanIE from .vube import VubeIE from .vuclip import VuClipIE diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py new file mode 100644 index 000000000..33618c951 --- /dev/null +++ b/youtube_dl/extractor/vrv.py @@ -0,0 +1,151 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import json +import hashlib +import hmac +import random +import string +import time + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlencode, + compat_urlparse, +) +from ..utils import ( + float_or_none, + int_or_none, +) + + +class VRVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P[A-Z0-9]+)' + _TEST = { + 'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT', + 'info_dict': { + 'id': 'GR9PNZ396', + 'ext': 'mp4', + 'title': 'BOSTON: WHERE THE PAST IS THE PRESENT', + 'description': 'md5:4ec8844ac262ca2df9e67c0983c6b83f', + 'uploader_id': 'seeso', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + _API_DOMAIN = None + _API_PARAMS = {} + _CMS_SIGNING = {} + + def _call_api(self, path, video_id, note, data=None): + base_url = self._API_DOMAIN + '/core/' + path + encoded_query = compat_urllib_parse_urlencode({ + 'oauth_consumer_key': self._API_PARAMS['oAuthKey'], + 
'oauth_nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]), + 'oauth_signature_method': 'HMAC-SHA1', + 'oauth_timestamp': int(time.time()), + 'oauth_version': '1.0', + }) + headers = self.geo_verification_headers() + if data: + data = json.dumps(data).encode() + headers['Content-Type'] = 'application/json' + method = 'POST' if data else 'GET' + base_string = '&'.join([method, compat_urlparse.quote(base_url, ''), compat_urlparse.quote(encoded_query, '')]) + oauth_signature = base64.b64encode(hmac.new( + (self._API_PARAMS['oAuthSecret'] + '&').encode('ascii'), + base_string.encode(), hashlib.sha1).digest()).decode() + encoded_query += '&oauth_signature=' + compat_urlparse.quote(oauth_signature, '') + return self._download_json( + '?'.join([base_url, encoded_query]), video_id, + note='Downloading %s JSON metadata' % note, headers=headers, data=data) + + def _call_cms(self, path, video_id, note): + return self._download_json( + self._API_DOMAIN + path, video_id, query=self._CMS_SIGNING, + note='Downloading %s JSON metadata' % note, headers=self.geo_verification_headers()) + + def _set_api_params(self, webpage, video_id): + if not self._API_PARAMS: + self._API_PARAMS = self._parse_json(self._search_regex( + r'window\.__APP_CONFIG__\s*=\s*({.+?})', + webpage, 'api config'), video_id)['cxApiParams'] + self._API_DOMAIN = self._API_PARAMS.get('apiDomain', 'https://api.vrv.co') + + def _set_cms_signing(self, video_id): + if not self._CMS_SIGNING: + self._CMS_SIGNING = self._call_api('index', video_id, 'CMS Signing')['cms_signing'] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + url, video_id, + headers=self.geo_verification_headers()) + media_resource = self._parse_json(self._search_regex( + r'window\.__INITIAL_STATE__\s*=\s*({.+?})', + webpage, 'inital state'), video_id).get('watch', {}).get('mediaResource') or {} + + video_data = media_resource.get('json') + if not video_data: + 
self._set_api_params(webpage, video_id) + episode_path = self._call_api('cms_resource', video_id, 'episode resource path', data={ + 'resource_key': 'cms:/episodes/' + video_id, + })['__links__']['cms_resource']['href'] + self._set_cms_signing(video_id) + video_data = self._call_cms(episode_path, video_id, 'video') + title = video_data['title'] + + streams_json = media_resource.get('streams', {}).get('json', {}) + if not streams_json: + self._set_api_params(webpage, video_id) + streams_path = video_data['__links__']['streams']['href'] + self._set_cms_signing(video_id) + streams_json = self._call_cms(streams_path, video_id, 'streams') + + audio_locale = streams_json.get('audio_locale') + formats = [] + for stream_id, stream in streams_json.get('streams', {}).get('adaptive_hls', {}).items(): + stream_url = stream.get('url') + if not stream_url: + continue + stream_id = stream_id or audio_locale + m3u8_formats = self._extract_m3u8_formats( + stream_url, video_id, 'mp4', m3u8_id=stream_id, + note='Downloading %s m3u8 information' % stream_id, + fatal=False) + if audio_locale: + for f in m3u8_formats: + f['language'] = audio_locale + formats.extend(m3u8_formats) + self._sort_formats(formats) + + thumbnails = [] + for thumbnail in video_data.get('images', {}).get('thumbnails', []): + thumbnail_url = thumbnail.get('source') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': video_data.get('description'), + 'duration': float_or_none(video_data.get('duration_ms'), 1000), + 'uploader_id': video_data.get('channel_id'), + 'series': video_data.get('series_title'), + 'season': video_data.get('season_title'), + 'season_number': int_or_none(video_data.get('season_number')), + 'season_id': video_data.get('season_id'), + 'episode': title, 
+ 'episode_number': int_or_none(video_data.get('episode_number')), + 'episode_id': video_data.get('production_episode_id'), + } From be61efdf1754d026f270f6d87446040231d56954 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 1 Apr 2017 07:26:40 +0100 Subject: [PATCH 087/200] [tvplay] Bypass geo restriction --- youtube_dl/extractor/tvplay.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 3eda0a399..99ff82a5d 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -225,7 +225,11 @@ class TVPlayIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - + geo_country = self._search_regex( + r'https?://[^/]+\.([a-z]{2})', url, + 'geo country', default=None) + if geo_country: + self._initialize_geo_bypass([geo_country.upper()]) video = self._download_json( 'http://playapi.mtgx.tv/v3/videos/%s' % video_id, video_id, 'Downloading video JSON') From e97fc8d6b837921ea8429727f026238b857e1b31 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 1 Apr 2017 07:50:24 +0100 Subject: [PATCH 088/200] [cwtv] extract ISM formats --- youtube_dl/extractor/cwtv.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py index 1ab9333b2..f4cf0f1c5 100644 --- a/youtube_dl/extractor/cwtv.py +++ b/youtube_dl/extractor/cwtv.py @@ -82,6 +82,11 @@ class CWTVIE(InfoExtractor): 'url': quality_url, 'tbr': tbr, }) + video_metadata = video_data['assetFields'] + ism_url = video_metadata.get('smoothStreamingUrl') + if ism_url: + formats.extend(self._extract_ism_formats( + ism_url, video_id, ism_id='mss', fatal=False)) self._sort_formats(formats) thumbnails = [{ @@ -90,8 +95,6 @@ class CWTVIE(InfoExtractor): 'height': image.get('height'), } for image_id, image in video_data['images'].items() if image.get('uri')] if video_data.get('images') else None - video_metadata = 
video_data['assetFields'] - subtitles = { 'en': [{ 'url': video_metadata['UnicornCcUrl'], From ca77b92f94010bdf2d44de44cb23e32075b7dcaa Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 1 Apr 2017 09:33:23 +0100 Subject: [PATCH 089/200] [crunchyroll] pass geo verifcation proxy --- youtube_dl/extractor/crunchyroll.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index d15fd3744..2ed8b30bb 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -390,7 +390,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text else: webpage_url = 'http://www.' + mobj.group('url') - webpage = self._download_webpage(self._add_skip_wall(webpage_url), video_id, 'Downloading webpage') + webpage = self._download_webpage( + self._add_skip_wall(webpage_url), video_id, + headers=self.geo_verification_headers()) note_m = self._html_search_regex( r'
(.+?)
', webpage, 'trailer-notice', default='') @@ -565,7 +567,9 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): def _real_extract(self, url): show_id = self._match_id(url) - webpage = self._download_webpage(self._add_skip_wall(url), show_id) + webpage = self._download_webpage( + self._add_skip_wall(url), show_id, + headers=self.geo_verification_headers()) title = self._html_search_regex( r'(?s)]*>\s*(.*?)', webpage, 'title') From 2cd668ee591df4f271ed4394ba9b38262ae3c40e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 Apr 2017 18:55:48 +0700 Subject: [PATCH 090/200] [xfileshare] Improve extraction and extract hls formats --- youtube_dl/extractor/xfileshare.py | 57 +++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index e616adce3..6de5b26d7 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( decode_packed_codes, + determine_ext, ExtractorError, int_or_none, NO_DEFAULT, @@ -95,6 +96,16 @@ class XFileShareIE(InfoExtractor): # removed by administrator 'url': 'http://xvidstage.com/amfy7atlkx25', 'only_matching': True, + }, { + 'url': 'http://vidabc.com/i8ybqscrphfv', + 'info_dict': { + 'id': 'i8ybqscrphfv', + 'ext': 'mp4', + 'title': 're:Beauty and the Beast 2017', + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -133,31 +144,45 @@ class XFileShareIE(InfoExtractor): webpage, 'title', default=None) or self._og_search_title( webpage, default=None) or video_id).strip() - def extract_video_url(default=NO_DEFAULT): - return self._search_regex( - (r'file\s*:\s*(["\'])(?Phttp.+?)\1,', - r'file_link\s*=\s*(["\'])(?Phttp.+?)\1', - r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?Phttp.+?)\2\)', - r']+src=(["\'])(?Phttp.+?)\1'), - webpage, 'file url', default=default, group='url') + 
def extract_formats(default=NO_DEFAULT): + urls = [] + for regex in ( + r'file\s*:\s*(["\'])(?Phttp(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1', + r'file_link\s*=\s*(["\'])(?Phttp(?:(?!\1).)+)\1', + r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?Phttp(?:(?!\2).)+)\2\)', + r']+src=(["\'])(?Phttp(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1'): + for mobj in re.finditer(regex, webpage): + video_url = mobj.group('url') + if video_url not in urls: + urls.append(video_url) + formats = [] + for video_url in urls: + if determine_ext(video_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + else: + formats.append({ + 'url': video_url, + 'format_id': 'sd', + }) + if not formats and default is not NO_DEFAULT: + return default + self._sort_formats(formats) + return formats - video_url = extract_video_url(default=None) + formats = extract_formats(default=None) - if not video_url: + if not formats: webpage = decode_packed_codes(self._search_regex( r"(}\('(.+)',(\d+),(\d+),'[^']*\b(?:file|embed)\b[^']*'\.split\('\|'\))", webpage, 'packed code')) - video_url = extract_video_url() + formats = extract_formats() thumbnail = self._search_regex( r'image\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'thumbnail', default=None) - formats = [{ - 'format_id': 'sd', - 'url': video_url, - 'quality': 1, - }] - return { 'id': video_id, 'title': title, From eecea00d36f29f3b22e5936ed48fa91456ab066a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 Apr 2017 18:56:35 +0700 Subject: [PATCH 091/200] [xfileshare] Add support for vidabc.com (closes #12589) --- youtube_dl/extractor/xfileshare.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index 6de5b26d7..6856fb3bf 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -27,6 +27,7 @@ class 
XFileShareIE(InfoExtractor): ('vidto.me', 'Vidto'), ('streamin.to', 'Streamin.To'), ('xvidstage.com', 'XVIDSTAGE'), + ('vidabc.com', 'Vid ABC'), ) IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1]) From 91399b2fcc95e72f052ee9eab8e12b68d1815c9e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 1 Apr 2017 13:32:38 +0100 Subject: [PATCH 092/200] [funimation] fix extraction(closes #10696)(#11773) --- youtube_dl/extractor/funimation.py | 211 ++++++++++------------------- 1 file changed, 73 insertions(+), 138 deletions(-) diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index eba00cd5a..e44a2a87f 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -7,9 +7,9 @@ from ..compat import ( compat_urllib_parse_unquote_plus, ) from ..utils import ( - clean_html, determine_ext, int_or_none, + js_to_json, sanitized_Request, ExtractorError, urlencode_postdata @@ -17,34 +17,26 @@ from ..utils import ( class FunimationIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?funimation\.com/shows/[^/]+/videos/(?:official|promotional)/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/shows/[^/]+/(?P[^/?#&]+)' _NETRC_MACHINE = 'funimation' _TESTS = [{ - 'url': 'http://www.funimation.com/shows/air/videos/official/breeze', + 'url': 'https://www.funimation.com/shows/hacksign/role-play/', 'info_dict': { - 'id': '658', - 'display_id': 'breeze', - 'ext': 'mp4', - 'title': 'Air - 1 - Breeze', - 'description': 'md5:1769f43cd5fc130ace8fd87232207892', - 'thumbnail': r're:https?://.*\.jpg', - }, - 'skip': 'Access without user interaction is forbidden by CloudFlare, and video removed', - }, { - 'url': 'http://www.funimation.com/shows/hacksign/videos/official/role-play', - 'info_dict': { - 'id': '31128', + 'id': '91144', 'display_id': 'role-play', 'ext': 'mp4', - 'title': '.hack//SIGN - 1 - Role Play', + 'title': '.hack//SIGN - Role Play', 'description': 
'md5:b602bdc15eef4c9bbb201bb6e6a4a2dd', 'thumbnail': r're:https?://.*\.jpg', }, - 'skip': 'Access without user interaction is forbidden by CloudFlare', + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { - 'url': 'http://www.funimation.com/shows/attack-on-titan-junior-high/videos/promotional/broadcast-dub-preview', + 'url': 'https://www.funimation.com/shows/attack-on-titan-junior-high/broadcast-dub-preview/', 'info_dict': { 'id': '9635', 'display_id': 'broadcast-dub-preview', @@ -54,25 +46,13 @@ class FunimationIE(InfoExtractor): 'thumbnail': r're:https?://.*\.(?:jpg|png)', }, 'skip': 'Access without user interaction is forbidden by CloudFlare', + }, { + 'url': 'https://www.funimationnow.uk/shows/puzzle-dragons-x/drop-impact/simulcast/', + 'only_matching': True, }] _LOGIN_URL = 'http://www.funimation.com/login' - def _download_webpage(self, *args, **kwargs): - try: - return super(FunimationIE, self)._download_webpage(*args, **kwargs) - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - response = ee.cause.read() - if b'>Please complete the security check to access<' in response: - raise ExtractorError( - 'Access to funimation.com is blocked by CloudFlare. 
' - 'Please browse to http://www.funimation.com/, solve ' - 'the reCAPTCHA, export browser cookies to a text file,' - ' and then try again with --cookies YOUR_COOKIE_FILE.', - expected=True) - raise - def _extract_cloudflare_session_ua(self, url): ci_session_cookie = self._get_cookies(url).get('ci_session') if ci_session_cookie: @@ -114,119 +94,74 @@ class FunimationIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + def _search_kane(name): + return self._search_regex( + r"KANE_customdimensions\.%s\s*=\s*'([^']+)';" % name, + webpage, name, default=None) + + title_data = self._parse_json(self._search_regex( + r'TITLE_DATA\s*=\s*({[^}]+})', + webpage, 'title data', default=''), + display_id, js_to_json, fatal=False) or {} + + video_id = title_data.get('id') or self._search_regex([ + r"KANE_customdimensions.videoID\s*=\s*'(\d+)';", + r']+src="/player/(\d+)"', + ], webpage, 'video_id', default=None) + if not video_id: + player_url = self._html_search_meta([ + 'al:web:url', + 'og:video:url', + 'og:video:secure_url', + ], webpage, fatal=True) + video_id = self._search_regex(r'/player/(\d+)', player_url, 'video id') + + title = episode = title_data.get('title') or _search_kane('videoTitle') or self._og_search_title(webpage) + series = _search_kane('showName') + if series: + title = '%s - %s' % (series, title) + description = self._html_search_meta(['description', 'og:description'], webpage, fatal=True) + + try: + sources = self._download_json( + 'https://prod-api-funimationnow.dadcdigital.com/api/source/catalog/video/%s/signed/' % video_id, + video_id)['items'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + error = self._parse_json(e.cause.read(), video_id)['errors'][0] + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, error.get('detail') or error.get('title')), expected=True) + raise - errors = [] formats = [] - - ERRORS_MAP = 
{ - 'ERROR_MATURE_CONTENT_LOGGED_IN': 'matureContentLoggedIn', - 'ERROR_MATURE_CONTENT_LOGGED_OUT': 'matureContentLoggedOut', - 'ERROR_SUBSCRIPTION_LOGGED_OUT': 'subscriptionLoggedOut', - 'ERROR_VIDEO_EXPIRED': 'videoExpired', - 'ERROR_TERRITORY_UNAVAILABLE': 'territoryUnavailable', - 'SVODBASIC_SUBSCRIPTION_IN_PLAYER': 'basicSubscription', - 'SVODNON_SUBSCRIPTION_IN_PLAYER': 'nonSubscription', - 'ERROR_PLAYER_NOT_RESPONDING': 'playerNotResponding', - 'ERROR_UNABLE_TO_CONNECT_TO_CDN': 'unableToConnectToCDN', - 'ERROR_STREAM_NOT_FOUND': 'streamNotFound', - } - - USER_AGENTS = ( - # PC UA is served with m3u8 that provides some bonus lower quality formats - ('pc', 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0'), - # Mobile UA allows to extract direct links and also does not fail when - # PC UA fails with hulu error (e.g. - # http://www.funimation.com/shows/hacksign/videos/official/role-play) - ('mobile', 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.114 Mobile Safari/537.36'), - ) - - user_agent = self._extract_cloudflare_session_ua(url) - if user_agent: - USER_AGENTS = ((None, user_agent),) - - for kind, user_agent in USER_AGENTS: - request = sanitized_Request(url) - request.add_header('User-Agent', user_agent) - webpage = self._download_webpage( - request, display_id, - 'Downloading %s webpage' % kind if kind else 'Downloading webpage') - - playlist = self._parse_json( - self._search_regex( - r'var\s+playersData\s*=\s*(\[.+?\]);\n', - webpage, 'players data'), - display_id)[0]['playlist'] - - items = next(item['items'] for item in playlist if item.get('items')) - item = next(item for item in items if item.get('itemAK') == display_id) - - error_messages = {} - video_error_messages = self._search_regex( - r'var\s+videoErrorMessages\s*=\s*({.+?});\n', - webpage, 'error messages', default=None) - if video_error_messages: - error_messages_json = 
self._parse_json(video_error_messages, display_id, fatal=False) - if error_messages_json: - for _, error in error_messages_json.items(): - type_ = error.get('type') - description = error.get('description') - content = error.get('content') - if type_ == 'text' and description and content: - error_message = ERRORS_MAP.get(description) - if error_message: - error_messages[error_message] = content - - for video in item.get('videoSet', []): - auth_token = video.get('authToken') - if not auth_token: - continue - funimation_id = video.get('FUNImationID') or video.get('videoId') - preference = 1 if video.get('languageMode') == 'dub' else 0 - if not auth_token.startswith('?'): - auth_token = '?%s' % auth_token - for quality, height in (('sd', 480), ('hd', 720), ('hd1080', 1080)): - format_url = video.get('%sUrl' % quality) - if not format_url: - continue - if not format_url.startswith(('http', '//')): - errors.append(format_url) - continue - if determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url + auth_token, display_id, 'mp4', entry_protocol='m3u8_native', - preference=preference, m3u8_id='%s-hls' % funimation_id, fatal=False)) - else: - tbr = int_or_none(self._search_regex( - r'-(\d+)[Kk]', format_url, 'tbr', default=None)) - formats.append({ - 'url': format_url + auth_token, - 'format_id': '%s-http-%dp' % (funimation_id, height), - 'height': height, - 'tbr': tbr, - 'preference': preference, - }) - - if not formats and errors: - raise ExtractorError( - '%s returned error: %s' - % (self.IE_NAME, clean_html(error_messages.get(errors[0], errors[0]))), - expected=True) - + for source in sources: + source_url = source.get('src') + if not source_url: + continue + source_type = source.get('videoType') or determine_ext(source_url) + if source_type == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'format_id': source_type, + 'url': source_url, 
+ }) self._sort_formats(formats) - title = item['title'] - artist = item.get('artist') - if artist: - title = '%s - %s' % (artist, title) - description = self._og_search_description(webpage) or item.get('description') - thumbnail = self._og_search_thumbnail(webpage) or item.get('posterUrl') - video_id = item.get('itemId') or display_id - return { 'id': video_id, 'display_id': display_id, 'title': title, 'description': description, - 'thumbnail': thumbnail, + 'thumbnail': self._og_search_thumbnail(webpage), + 'series': series, + 'season_number': int_or_none(title_data.get('seasonNum') or _search_kane('season')), + 'episode_number': int_or_none(title_data.get('episodeNum')), + 'episode': episode, + 'season_id': title_data.get('seriesId'), 'formats': formats, } From a6f3a162f35cc05ac5a34773b438dd4c5f0d164a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 1 Apr 2017 15:35:39 +0100 Subject: [PATCH 093/200] [limelight] improve extraction for audio only formats --- youtube_dl/extractor/limelight.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index 422be2528..f52c2e169 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -62,13 +62,21 @@ class LimelightBaseIE(InfoExtractor): fmt = { 'url': stream_url, 'abr': float_or_none(stream.get('audioBitRate')), - 'vbr': float_or_none(stream.get('videoBitRate')), 'fps': float_or_none(stream.get('videoFrameRate')), - 'width': int_or_none(stream.get('videoWidthInPixels')), - 'height': int_or_none(stream.get('videoHeightInPixels')), 'ext': ext, } - rtmp = re.search(r'^(?Prtmpe?://(?P[^/]+)/(?P.+))/(?Pmp4:.+)$', stream_url) + width = int_or_none(stream.get('videoWidthInPixels')) + height = int_or_none(stream.get('videoHeightInPixels')) + vbr = float_or_none(stream.get('videoBitRate')) + if width or height or vbr: + fmt.update({ + 'width': width, + 'height': height, + 'vbr': vbr, + }) + 
else: + fmt['vcodec'] = 'none' + rtmp = re.search(r'^(?Prtmpe?://(?P[^/]+)/(?P.+))/(?Pmp[34]:.+)$', stream_url) if rtmp: format_id = 'rtmp' if stream.get('videoBitRate'): From 48ab554feb9c6d3e0f13e1357e04f4c89089e2d3 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 1 Apr 2017 18:09:36 +0100 Subject: [PATCH 094/200] [vrv] add support for series pages --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/vrv.py | 88 ++++++++++++++++++++++-------- 2 files changed, 68 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 1b427e256..980333a11 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1182,7 +1182,10 @@ from .voxmedia import VoxMediaIE from .vporn import VpornIE from .vrt import VRTIE from .vrak import VrakIE -from .vrv import VRVIE +from .vrv import ( + VRVIE, + VRVSeriesIE, +) from .medialaan import MedialaanIE from .vube import VubeIE from .vuclip import VuClipIE diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py index 33618c951..487047fd7 100644 --- a/youtube_dl/extractor/vrv.py +++ b/youtube_dl/extractor/vrv.py @@ -20,22 +20,7 @@ from ..utils import ( ) -class VRVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P[A-Z0-9]+)' - _TEST = { - 'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT', - 'info_dict': { - 'id': 'GR9PNZ396', - 'ext': 'mp4', - 'title': 'BOSTON: WHERE THE PAST IS THE PRESENT', - 'description': 'md5:4ec8844ac262ca2df9e67c0983c6b83f', - 'uploader_id': 'seeso', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } +class VRVBaseIE(InfoExtractor): _API_DOMAIN = None _API_PARAMS = {} _CMS_SIGNING = {} @@ -64,6 +49,8 @@ class VRVIE(InfoExtractor): note='Downloading %s JSON metadata' % note, headers=headers, data=data) def _call_cms(self, path, video_id, note): + if not self._CMS_SIGNING: + self._CMS_SIGNING 
= self._call_api('index', video_id, 'CMS Signing')['cms_signing'] return self._download_json( self._API_DOMAIN + path, video_id, query=self._CMS_SIGNING, note='Downloading %s JSON metadata' % note, headers=self.geo_verification_headers()) @@ -75,9 +62,30 @@ class VRVIE(InfoExtractor): webpage, 'api config'), video_id)['cxApiParams'] self._API_DOMAIN = self._API_PARAMS.get('apiDomain', 'https://api.vrv.co') - def _set_cms_signing(self, video_id): - if not self._CMS_SIGNING: - self._CMS_SIGNING = self._call_api('index', video_id, 'CMS Signing')['cms_signing'] + def _get_cms_resource(self, resource_key, video_id): + return self._call_api( + 'cms_resource', video_id, 'resource path', data={ + 'resource_key': resource_key, + })['__links__']['cms_resource']['href'] + + +class VRVIE(VRVBaseIE): + IE_NAME = 'vrv' + _VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P[A-Z0-9]+)' + _TEST = { + 'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT', + 'info_dict': { + 'id': 'GR9PNZ396', + 'ext': 'mp4', + 'title': 'BOSTON: WHERE THE PAST IS THE PRESENT', + 'description': 'md5:4ec8844ac262ca2df9e67c0983c6b83f', + 'uploader_id': 'seeso', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } def _real_extract(self, url): video_id = self._match_id(url) @@ -91,10 +99,8 @@ class VRVIE(InfoExtractor): video_data = media_resource.get('json') if not video_data: self._set_api_params(webpage, video_id) - episode_path = self._call_api('cms_resource', video_id, 'episode resource path', data={ - 'resource_key': 'cms:/episodes/' + video_id, - })['__links__']['cms_resource']['href'] - self._set_cms_signing(video_id) + episode_path = self._get_cms_resource( + 'cms:/episodes/' + video_id, video_id) video_data = self._call_cms(episode_path, video_id, 'video') title = video_data['title'] @@ -102,7 +108,6 @@ class VRVIE(InfoExtractor): if not streams_json: self._set_api_params(webpage, video_id) streams_path = 
video_data['__links__']['streams']['href'] - self._set_cms_signing(video_id) streams_json = self._call_cms(streams_path, video_id, 'streams') audio_locale = streams_json.get('audio_locale') @@ -149,3 +154,38 @@ class VRVIE(InfoExtractor): 'episode_number': int_or_none(video_data.get('episode_number')), 'episode_id': video_data.get('production_episode_id'), } + + +class VRVSeriesIE(VRVBaseIE): + IE_NAME = 'vrv:series' + _VALID_URL = r'https?://(?:www\.)?vrv\.co/series/(?P[A-Z0-9]+)' + _TEST = { + 'url': 'https://vrv.co/series/G68VXG3G6/The-Perfect-Insider', + 'info_dict': { + 'id': 'G68VXG3G6', + }, + 'playlist_mincount': 11, + } + + def _real_extract(self, url): + series_id = self._match_id(url) + webpage = self._download_webpage( + url, series_id, + headers=self.geo_verification_headers()) + + self._set_api_params(webpage, series_id) + seasons_path = self._get_cms_resource( + 'cms:/seasons?series_id=' + series_id, series_id) + seasons_data = self._call_cms(seasons_path, series_id, 'seasons') + + entries = [] + for season in seasons_data.get('items', []): + episodes_path = season['__links__']['season/episodes']['href'] + episodes = self._call_cms(episodes_path, series_id, 'episodes') + for episode in episodes.get('items', []): + episode_id = episode['id'] + entries.append(self.url_result( + 'https://vrv.co/watch/' + episode_id, + 'VRV', episode_id, episode.get('title'))) + + return self.playlist_result(entries, series_id) From 51342717cddafde83dbf39f2212be40a196a577a Mon Sep 17 00:00:00 2001 From: Timendum Date: Tue, 14 Mar 2017 16:11:09 +0100 Subject: [PATCH 095/200] [rai] Fix extraction --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/rai.py | 359 ++++++++++++++++------------- 2 files changed, 197 insertions(+), 164 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 980333a11..d9e8d53ac 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -802,7 +802,7 @@ 
from .radiojavan import RadioJavanIE from .radiobremen import RadioBremenIE from .radiofrance import RadioFranceIE from .rai import ( - RaiTVIE, + RaiPlayIE, RaiIE, ) from .rbmaradio import RBMARadioIE diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 41afbd9af..b67e94f88 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -3,8 +3,8 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( - determine_ext, ExtractorError, + determine_ext, find_xpath_attr, fix_xml_ampersands, int_or_none, @@ -55,7 +55,200 @@ class RaiBaseIE(InfoExtractor): return formats - def _extract_from_content_id(self, content_id, base_url): + +class RaiPlayIE(RaiBaseIE): + _VALID_URL = r'https?://(?:www\.)?raiplay\.it/.+?-(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})\.html' + _TESTS = [{ + 'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter', + 'md5': '340aa3b7afb54bfd14a8c11786450d76', + 'info_dict': { + 'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66', + 'ext': 'mp4', + 'title': 'La Casa Bianca', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': r're:^Rai.+', + 'description': 're:^[A-Za-z]+' + } + }, { + 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', + 'md5': 'ed4da3d70ccf8129a33ab16b34d20ab8', + 'info_dict': { + 'id': 'efebe701-969c-4593-92f3-285f0d1ce750', + 'ext': 'mp4', + 'title': 'Gazebo - #gazebotraindesi', + 'thumbnail': r're:^https?://.*\.png$', + 'uploader': r're:^Rai.+', + 'description': r're:^[A-Za-z]+' + } + }, { + 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', + 'md5': '8970abf8caf8aef4696e7b1f2adfc696', + 'info_dict': { + 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', + 'ext': 'mp4', + 'title': 'Report - Report del 07/04/2014', + 'thumbnail': 
r're:^https?://.*\.jpg$', + 'uploader': r're:^Rai.+', + 'description': r're:^[A-Za-z]+' + } + }] + _RESOLUTION = '600x400' + + def _real_extract(self, url): + video_id = self._match_id(url) + + # remove query and fragment part from url + canonical_url = compat_urlparse.urljoin(url, compat_urlparse.urlparse(url).path) + webpage = self._download_webpage(canonical_url, video_id) + + media = self._download_json('%s?json' % canonical_url, + video_id, 'Downloading video JSON') + + thumbnails = [] + if 'images' in media: + for _, value in media.get('images').items(): + if value: + thumbnails.append({ + 'url': value.replace('[RESOLUTION]', self._RESOLUTION) + }) + + if 'video' not in media: + raise ExtractorError('No video found') + + video = media.get('video') + duration = parse_duration(video.get('duration')), + formats = self._extract_relinker_formats(video.get('contentUrl'), video_id) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage).replace(' - video - RaiPlay', ''), + 'description': self._og_search_description(webpage), + 'uploader': media.get('channel'), + 'duration': duration, + 'thumbnails': thumbnails, + 'formats': formats + } + + +class RaiIE(RaiBaseIE): + _VALID_URL = r'https?://.+\.(?:rai|rainews)\.it/dl/.+?-(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' + _TESTS = [{ + # subdomain test case + 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', + 'info_dict': { + 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', + 'ext': 'mp4', + 'title': 'TG PRIMO TEMPO', + 'upload_date': '20140612', + 'duration': 1758, + 'thumbnail': r're:^https?://.*\.jpg$' + } + }, { + # rainews test case + 'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', + 'info_dict': { + 'id': '1632c009-c843-4836-bb65-80c33084a64b', + 'ext': 
'mp4', + 'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor \"La ragazza del treno\" ', + 'upload_date': '20161103', + 'thumbnail': r're:^https?://.*\.png$', + 'description': r're:^[A-Za-z]+' + } + }, { + # with media information + 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', + 'md5': '11959b4e44fa74de47011b5799490adf', + 'info_dict': { + 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', + 'ext': 'mp4', + 'title': 'TG1 ore 20:00 del 03/11/2016', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20161103', + 'description': r're:^[A-Za-z]+' + } + }, { + # drawMediaRaiTV test case + 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', + 'md5': '2dd727e61114e1ee9c47f0da6914e178', + 'info_dict': { + 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', + 'ext': 'mp4', + 'title': 'Il pacco', + 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', + 'upload_date': '20141221', + }, + }, { + # Direct relinker URL + 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', + # HDS live stream, MD5 is unstable + 'info_dict': { + 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', + 'ext': 'flv', + 'title': 'EuroNews', + }, + }, { + # Embedded content item ID + 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', + 'info_dict': { + 'id': 'd80d4b70-3812-4501-a888-92edec729f00', + 'ext': 'mp4', + 'title': r're:TG1 ore \d{2}:\d{2} del \d{2}/\d{2}/\d{4}', + 'upload_date': r're:\d{8}', + 'description': r're:.+', + }, + }, { + 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', + # HDS live stream, MD5 is unstable + 'info_dict': { + 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', + 'ext': 'mp4', + 'title': 'La diretta di Rainews24', + }, + }] + + def _real_extract(self, 
url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + iframe_url = self._search_regex( + [r']+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"', + r'drawMediaRaiTV\(["\'](.+?)["\']'], + webpage, 'iframe', default=None) + if iframe_url: + if not iframe_url.startswith('http'): + iframe_url = compat_urlparse.urljoin(url, iframe_url) + return self.url_result(iframe_url) + + content_item_id = self._search_regex( + r'initEdizione\((?P[\'"])ContentItem-(?P[^\'"]+)(?P=q1)', + webpage, 'content item ID', group='content_id', default=None) + if content_item_id: + return self._extract_from_content_id(content_item_id, url) + + try: + return self._extract_from_content_id(video_id, url) + except ExtractorError: + # no media data, only direct relinker + pass + + relinker_url = compat_urlparse.urljoin(url, self._search_regex( + r'(?:var\s+videoURL|mediaInfo\.mediaUri)\s*=\s*(?P[\'"])(?P(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)', + webpage, 'relinker URL', group='url')) + formats = self._extract_relinker_formats(relinker_url, video_id) + self._sort_formats(formats) + + title = self._search_regex( + r'var\s+videoTitolo\s*=\s*([\'"])(?P[^\'"]+)\1', + webpage, 'title', group='title', default=None) or self._og_search_title(webpage) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + } + + def _extract_from_content_id(self, content_id, url): media = self._download_json( 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id, content_id, 'Downloading video JSON') @@ -65,7 +258,7 @@ class RaiBaseIE(InfoExtractor): thumbnail_url = media.get(image_type) if thumbnail_url: thumbnails.append({ - 'url': compat_urlparse.urljoin(base_url, thumbnail_url), + 'url': compat_urlparse.urljoin(url, thumbnail_url), }) formats = [] @@ -105,163 +298,3 @@ class RaiBaseIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, } - - -class RaiTVIE(RaiBaseIE): - _VALID_URL = 
r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+(?:media|ondemand)/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' - _TESTS = [ - { - 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html', - 'md5': '8970abf8caf8aef4696e7b1f2adfc696', - 'info_dict': { - 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', - 'ext': 'mp4', - 'title': 'Report del 07/04/2014', - 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', - 'upload_date': '20140407', - 'duration': 6160, - 'thumbnail': r're:^https?://.*\.jpg$', - } - }, - { - # no m3u8 stream - 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', - # HDS download, MD5 is unstable - 'info_dict': { - 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', - 'ext': 'flv', - 'title': 'TG PRIMO TEMPO', - 'upload_date': '20140612', - 'duration': 1758, - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'skip': 'Geo-restricted to Italy', - }, - { - 'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html', - 'md5': '35cf7c229f22eeef43e48b5cf923bef0', - 'info_dict': { - 'id': '7aafdea9-0e5d-49d5-88a6-7e65da67ae13', - 'ext': 'mp4', - 'title': 'State of the Net, Antonella La Carpia: regole virali', - 'description': 'md5:b0ba04a324126903e3da7763272ae63c', - 'upload_date': '20140613', - }, - 'skip': 'Error 404', - }, - { - 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-b4a49761-e0cc-4b14-8736-2729f6f73132-tg2.html', - 'info_dict': { - 'id': 'b4a49761-e0cc-4b14-8736-2729f6f73132', - 'ext': 'mp4', - 'title': 'Alluvione in Sardegna e dissesto idrogeologico', - 'description': 'Edizione delle ore 20:30 ', - }, - 'skip': 'invalid urls', - }, - { - 'url': 'http://www.ilcandidato.rai.it/dl/ray/media/Il-Candidato---Primo-episodio-Le-Primarie-28e5525a-b495-45e8-a7c3-bc48ba45d2b6.html', - 'md5': 
'e57493e1cb8bc7c564663f363b171847', - 'info_dict': { - 'id': '28e5525a-b495-45e8-a7c3-bc48ba45d2b6', - 'ext': 'mp4', - 'title': 'Il Candidato - Primo episodio: "Le Primarie"', - 'description': 'md5:364b604f7db50594678f483353164fb8', - 'upload_date': '20140923', - 'duration': 386, - 'thumbnail': r're:^https?://.*\.jpg$', - } - }, - ] - - def _real_extract(self, url): - video_id = self._match_id(url) - - return self._extract_from_content_id(video_id, url) - - -class RaiIE(RaiBaseIE): - _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' - _TESTS = [ - { - 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', - 'md5': '2dd727e61114e1ee9c47f0da6914e178', - 'info_dict': { - 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', - 'ext': 'mp4', - 'title': 'Il pacco', - 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', - 'upload_date': '20141221', - }, - }, - { - # Direct relinker URL - 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', - # HDS live stream, MD5 is unstable - 'info_dict': { - 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', - 'ext': 'flv', - 'title': 'EuroNews', - }, - 'skip': 'Geo-restricted to Italy', - }, - { - # Embedded content item ID - 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', - 'md5': '84c1135ce960e8822ae63cec34441d63', - 'info_dict': { - 'id': '0960e765-62c8-474a-ac4b-7eb3e2be39c8', - 'ext': 'mp4', - 'title': 'TG1 ore 20:00 del 02/07/2016', - 'upload_date': '20160702', - }, - }, - { - 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', - # HDS live stream, MD5 is unstable - 'info_dict': { - 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', - 'ext': 'flv', - 'title': 'La diretta di Rainews24', - }, - }, - ] - 
- @classmethod - def suitable(cls, url): - return False if RaiTVIE.suitable(url) else super(RaiIE, cls).suitable(url) - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - iframe_url = self._search_regex( - [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"', - r'drawMediaRaiTV\(["\'](.+?)["\']'], - webpage, 'iframe', default=None) - if iframe_url: - if not iframe_url.startswith('http'): - iframe_url = compat_urlparse.urljoin(url, iframe_url) - return self.url_result(iframe_url) - - content_item_id = self._search_regex( - r'initEdizione\((?P<q1>[\'"])ContentItem-(?P<content_id>[^\'"]+)(?P=q1)', - webpage, 'content item ID', group='content_id', default=None) - if content_item_id: - return self._extract_from_content_id(content_item_id, url) - - relinker_url = compat_urlparse.urljoin(url, self._search_regex( - r'(?:var\s+videoURL|mediaInfo\.mediaUri)\s*=\s*(?P<q1>[\'"])(?P<url>(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)', - webpage, 'relinker URL', group='url')) - formats = self._extract_relinker_formats(relinker_url, video_id) - self._sort_formats(formats) - - title = self._search_regex( - r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1', - webpage, 'title', group='title', default=None) or self._og_search_title(webpage) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - } From b8d8cced9b55c57f3b09e83972be9d6318a459ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Apr 2017 02:14:42 +0700 Subject: [PATCH 096/200] [rai] Improve extraction (closes #11790) * Fix georestriction detection * Detect live streams + Extract relinker metadata * Improve ContentItem detection + Extract series metadata * Fix tests --- youtube_dl/extractor/rai.py | 359 +++++++++++++++++++++++------------- 1 file changed, 233 insertions(+), 126 deletions(-) diff --git a/youtube_dl/extractor/rai.py 
b/youtube_dl/extractor/rai.py index b67e94f88..b77b0a08e 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -1,23 +1,40 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_urlparse, + compat_str, +) from ..utils import ( ExtractorError, determine_ext, find_xpath_attr, fix_xml_ampersands, + GeoRestrictedError, int_or_none, parse_duration, + strip_or_none, + try_get, unified_strdate, + unified_timestamp, update_url_query, + urljoin, xpath_text, ) class RaiBaseIE(InfoExtractor): - def _extract_relinker_formats(self, relinker_url, video_id): + _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' + _GEO_COUNTRIES = ['IT'] + _GEO_BYPASS = False + + def _extract_relinker_info(self, relinker_url, video_id): formats = [] + geoprotection = None + is_live = None + duration = None for platform in ('mon', 'flash', 'native'): relinker = self._download_xml( @@ -27,9 +44,27 @@ class RaiBaseIE(InfoExtractor): query={'output': 45, 'pl': platform}, headers=self.geo_verification_headers()) - media_url = find_xpath_attr(relinker, './url', 'type', 'content').text + if not geoprotection: + geoprotection = xpath_text( + relinker, './geoprotection', default=None) == 'Y' + + if not is_live: + is_live = xpath_text( + relinker, './is_live', default=None) == 'Y' + if not duration: + duration = parse_duration(xpath_text( + relinker, './duration', default=None)) + + url_elem = find_xpath_attr(relinker, './url', 'type', 'content') + if url_elem is None: + continue + + media_url = url_elem.text + + # This does not imply geo restriction (e.g. 
+ # http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html) if media_url == 'http://download.rai.it/video_no_available.mp4': - self.raise_geo_restricted() + continue ext = determine_ext(media_url) if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'): @@ -53,11 +88,18 @@ class RaiBaseIE(InfoExtractor): 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', }) - return formats + if not formats and geoprotection is True: + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + + return dict((k, v) for k, v in { + 'is_live': is_live, + 'duration': duration, + 'formats': formats, + }.items() if v is not None) class RaiPlayIE(RaiBaseIE): - _VALID_URL = r'https?://(?:www\.)?raiplay\.it/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})\.html' + _VALID_URL = r'(?P<url>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s)\.html)' % RaiBaseIE._UUID_RE _TESTS = [{ 'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter', 'md5': '340aa3b7afb54bfd14a8c11786450d76', @@ -65,110 +107,130 @@ class RaiPlayIE(RaiBaseIE): 'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66', 'ext': 'mp4', 'title': 'La Casa Bianca', + 'alt_title': 'S2016 - Puntata del 23/10/2016', + 'description': 'md5:a09d45890850458077d1f68bb036e0a5', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': r're:^Rai.+', - 'description': 're:^[A-Za-z]+' - } - }, { - 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', - 'md5': 'ed4da3d70ccf8129a33ab16b34d20ab8', - 'info_dict': { - 'id': 'efebe701-969c-4593-92f3-285f0d1ce750', - 'ext': 'mp4', - 'title': 'Gazebo - #gazebotraindesi', - 'thumbnail': r're:^https?://.*\.png$', - 'uploader': r're:^Rai.+', - 'description': r're:^[A-Za-z]+' - } + 'uploader': 'Rai 3', + 'creator': 'Rai 3', + 'duration': 3278, + 'timestamp': 1477764300, + 'upload_date': '20161029', + 'series': 'La 
Casa Bianca', + 'season': '2016', + }, }, { 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', 'info_dict': { 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', 'ext': 'mp4', - 'title': 'Report - Report del 07/04/2014', + 'title': 'Report del 07/04/2014', + 'alt_title': 'S2013/14 - Puntata del 07/04/2014', + 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': r're:^Rai.+', - 'description': r're:^[A-Za-z]+' - } + 'uploader': 'Rai 5', + 'creator': 'Rai 5', + 'duration': 6160, + 'series': 'Report', + 'season_number': 5, + 'season': '2013/14', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', + 'only_matching': True, }] - _RESOLUTION = '600x400' def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + url, video_id = mobj.group('url', 'id') - # remove query and fragment part from url - canonical_url = compat_urlparse.urljoin(url, compat_urlparse.urlparse(url).path) - webpage = self._download_webpage(canonical_url, video_id) + media = self._download_json( + '%s?json' % url, video_id, 'Downloading video JSON') - media = self._download_json('%s?json' % canonical_url, - video_id, 'Downloading video JSON') + title = media['name'] + + video = media['video'] + + relinker_info = self._extract_relinker_info(video['contentUrl'], video_id) + self._sort_formats(relinker_info['formats']) thumbnails = [] if 'images' in media: for _, value in media.get('images').items(): if value: thumbnails.append({ - 'url': value.replace('[RESOLUTION]', self._RESOLUTION) + 'url': value.replace('[RESOLUTION]', '600x400') }) - if 'video' not in media: - raise ExtractorError('No video found') + timestamp = unified_timestamp(try_get( + media, lambda x: x['availabilities'][0]['start'], 
compat_str)) - video = media.get('video') - duration = parse_duration(video.get('duration')), - formats = self._extract_relinker_formats(video.get('contentUrl'), video_id) - self._sort_formats(formats) - - return { + info = { 'id': video_id, - 'title': self._og_search_title(webpage).replace(' - video - RaiPlay', ''), - 'description': self._og_search_description(webpage), + 'title': title, + 'alt_title': media.get('subtitle'), + 'description': media.get('description'), 'uploader': media.get('channel'), - 'duration': duration, + 'creator': media.get('editor'), + 'duration': parse_duration(video.get('duration')), + 'timestamp': timestamp, 'thumbnails': thumbnails, - 'formats': formats + 'series': try_get( + media, lambda x: x['isPartOf']['name'], compat_str), + 'season_number': int_or_none(try_get( + media, lambda x: x['isPartOf']['numeroStagioni'])), + 'season': media.get('stagione') or None, } + info.update(relinker_info) + + return info + class RaiIE(RaiBaseIE): - _VALID_URL = r'https?://.+\.(?:rai|rainews)\.it/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' + _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/dl/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE _TESTS = [{ - # subdomain test case + # var uniquename = "ContentItem-..." + # data-id="ContentItem-..." 
'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', 'info_dict': { 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', 'ext': 'mp4', 'title': 'TG PRIMO TEMPO', - 'upload_date': '20140612', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 1758, - 'thumbnail': r're:^https?://.*\.jpg$' + 'upload_date': '20140612', } }, { - # rainews test case + # with ContentItem in many metas 'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', 'info_dict': { 'id': '1632c009-c843-4836-bb65-80c33084a64b', 'ext': 'mp4', - 'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor \"La ragazza del treno\" ', - 'upload_date': '20161103', + 'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor "La ragazza del treno"', + 'description': 'I film in uscita questa settimana.', 'thumbnail': r're:^https?://.*\.png$', - 'description': r're:^[A-Za-z]+' + 'duration': 833, + 'upload_date': '20161103', } }, { - # with media information + # with ContentItem in og:url 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', 'md5': '11959b4e44fa74de47011b5799490adf', 'info_dict': { 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', 'ext': 'mp4', 'title': 'TG1 ore 20:00 del 03/11/2016', + 'description': 'TG1 edizione integrale ore 20:00 del giorno 03/11/2016', 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2214, 'upload_date': '20161103', - 'description': r're:^[A-Za-z]+' } }, { - # drawMediaRaiTV test case + # drawMediaRaiTV(...) 
'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', 'md5': '2dd727e61114e1ee9c47f0da6914e178', 'info_dict': { @@ -176,83 +238,67 @@ class RaiIE(RaiBaseIE): 'ext': 'mp4', 'title': 'Il pacco', 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20141221', }, }, { - # Direct relinker URL + # initEdizione('ContentItem-...' + 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', + 'info_dict': { + 'id': 'c2187016-8484-4e3a-8ac8-35e475b07303', + 'ext': 'mp4', + 'title': r're:TG1 ore \d{2}:\d{2} del \d{2}/\d{2}/\d{4}', + 'duration': 2274, + 'upload_date': '20170401', + }, + 'skip': 'Changes daily', + }, { + # HDS live stream with only relinker URL 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', - # HDS live stream, MD5 is unstable 'info_dict': { 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', 'ext': 'flv', 'title': 'EuroNews', }, - }, { - # Embedded content item ID - 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', - 'info_dict': { - 'id': 'd80d4b70-3812-4501-a888-92edec729f00', - 'ext': 'mp4', - 'title': r're:TG1 ore \d{2}:\d{2} del \d{2}/\d{2}/\d{4}', - 'upload_date': r're:\d{8}', - 'description': r're:.+', + 'params': { + 'skip_download': True, }, }, { + # HLS live stream with ContentItem in og:url 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', - # HDS live stream, MD5 is unstable 'info_dict': { 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', 'ext': 'mp4', 'title': 'La diretta di Rainews24', }, + 'params': { + 'skip_download': True, + }, }] - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - iframe_url = self._search_regex( - 
[r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"', - r'drawMediaRaiTV\(["\'](.+?)["\']'], - webpage, 'iframe', default=None) - if iframe_url: - if not iframe_url.startswith('http'): - iframe_url = compat_urlparse.urljoin(url, iframe_url) - return self.url_result(iframe_url) - - content_item_id = self._search_regex( - r'initEdizione\((?P<q1>[\'"])ContentItem-(?P<content_id>[^\'"]+)(?P=q1)', - webpage, 'content item ID', group='content_id', default=None) - if content_item_id: - return self._extract_from_content_id(content_item_id, url) - - try: - return self._extract_from_content_id(video_id, url) - except ExtractorError: - # no media data, only direct relinker - pass - - relinker_url = compat_urlparse.urljoin(url, self._search_regex( - r'(?:var\s+videoURL|mediaInfo\.mediaUri)\s*=\s*(?P<q1>[\'"])(?P<url>(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)', - webpage, 'relinker URL', group='url')) - formats = self._extract_relinker_formats(relinker_url, video_id) - self._sort_formats(formats) - - title = self._search_regex( - r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1', - webpage, 'title', group='title', default=None) or self._og_search_title(webpage) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - } - def _extract_from_content_id(self, content_id, url): media = self._download_json( 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id, content_id, 'Downloading video JSON') + title = media['name'].strip() + + media_type = media['type'] + if 'Audio' in media_type: + relinker_info = { + 'formats': { + 'format_id': media.get('formatoAudio'), + 'url': media['audioUrl'], + 'ext': media.get('formatoAudio'), + } + } + elif 'Video' in media_type: + relinker_info = self._extract_relinker_info(media['mediaUri'], content_id) + else: + raise ExtractorError('not a media file') + + self._sort_formats(relinker_info['formats']) + thumbnails = [] for image_type in ('image', 'image_medium', 
'image_300'): thumbnail_url = media.get(image_type) @@ -261,20 +307,6 @@ class RaiIE(RaiBaseIE): 'url': compat_urlparse.urljoin(url, thumbnail_url), }) - formats = [] - media_type = media['type'] - if 'Audio' in media_type: - formats.append({ - 'format_id': media.get('formatoAudio'), - 'url': media['audioUrl'], - 'ext': media.get('formatoAudio'), - }) - elif 'Video' in media_type: - formats.extend(self._extract_relinker_formats(media['mediaUri'], content_id)) - self._sort_formats(formats) - else: - raise ExtractorError('not a media file') - subtitles = {} captions = media.get('subtitlesUrl') if captions: @@ -287,14 +319,89 @@ class RaiIE(RaiBaseIE): 'url': captions, }] - return { + info = { 'id': content_id, - 'title': media['name'], - 'description': media.get('desc'), + 'title': title, + 'description': strip_or_none(media.get('desc')), 'thumbnails': thumbnails, 'uploader': media.get('author'), 'upload_date': unified_strdate(media.get('date')), 'duration': parse_duration(media.get('length')), - 'formats': formats, 'subtitles': subtitles, } + + info.update(relinker_info) + + return info + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + content_item_id = None + + content_item_url = self._html_search_meta( + ('og:url', 'og:video', 'og:video:secure_url', 'twitter:url', + 'twitter:player', 'jsonlink'), webpage, default=None) + if content_item_url: + content_item_id = self._search_regex( + r'ContentItem-(%s)' % self._UUID_RE, content_item_url, + 'content item id', default=None) + + if not content_item_id: + content_item_id = self._search_regex( + r'''(?x) + (?: + (?:initEdizione|drawMediaRaiTV)\(| + <(?:[^>]+\bdata-id|var\s+uniquename)= + ) + (["\']) + (?:(?!\1).)*\bContentItem-(?P<id>%s) + ''' % self._UUID_RE, + webpage, 'content item id', default=None, group='id') + + content_item_ids = set() + content_item_ids.add(content_item_id) + if video_id not in content_item_ids: + 
content_item_ids.add(video_id) + + for content_item_id in content_item_ids: + try: + return self._extract_from_content_id(content_item_id, url) + except GeoRestrictedError: + raise + except ExtractorError: + pass + + relinker_url = self._search_regex( + r'''(?x) + (?: + var\s+videoURL| + mediaInfo\.mediaUri + )\s*=\s* + ([\'"]) + (?P<url> + (?:https?:)? + //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\? + (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1 + ''', + webpage, 'relinker URL', group='url') + + relinker_info = self._extract_relinker_info( + urljoin(url, relinker_url), video_id) + self._sort_formats(relinker_info['formats']) + + title = self._search_regex( + r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1', + webpage, 'title', group='title', + default=None) or self._og_search_title(webpage) + + info = { + 'id': video_id, + 'title': title, + } + + info.update(relinker_info) + + return info From 361f293ab85c29ab62cb91577d2be34814d5c552 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Apr 2017 02:24:13 +0700 Subject: [PATCH 097/200] [rai] Skip not found content item id --- youtube_dl/extractor/rai.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index b77b0a08e..077546a73 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -362,7 +362,8 @@ class RaiIE(RaiBaseIE): webpage, 'content item id', default=None, group='id') content_item_ids = set() - content_item_ids.add(content_item_id) + if content_item_id: + content_item_ids.add(content_item_id) if video_id not in content_item_ids: content_item_ids.add(video_id) From a76c25146a93052f367a0fb8cdd9a08ba9cef491 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Apr 2017 02:37:18 +0700 Subject: [PATCH 098/200] [ChangeLog] Actualize --- ChangeLog | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git 
a/ChangeLog b/ChangeLog index 07725b12a..3ffc647f1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,27 @@ +version <unreleased> + +Core +[YoutubeDL] Return early when extraction of url_transparent fails + +Extractors +* [rai] Fix and improve extraction (#11790) ++ [vrv] Add support for series pages +* [limelight] Improve extraction for audio only formats +* [funimation] Fix extraction (#10696, #11773) ++ [xfileshare] Add support for vidabc.com (#12589) ++ [xfileshare] Improve extraction and extract hls formats ++ [crunchyroll] Pass geo verifcation proxy ++ [cwtv] Extract ISM formats ++ [tvplay] Bypass geo restriction ++ [vrv] Add support for vrv.co ++ [packtpub] Add support for packtpub.com (#12610) ++ [generic] Pass base_url to _parse_jwplayer_data ++ [adn] Add support for animedigitalnetwork.fr (#4866) ++ [allocine] Extract more metadata +* [allocine] Fix extraction (#12592) +* [openload] Fix extraction + + version 2017.03.26 Core From b56e41a701d73072b7d62a151b7aafd87955dfe8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Apr 2017 02:39:15 +0700 Subject: [PATCH 099/200] release 2017.04.02 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- README.md | 8 ++++---- docs/supportedsites.md | 9 +++++++-- youtube_dl/version.py | 2 +- 5 files changed, 16 insertions(+), 11 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 2f717926c..c1b737619 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.03.26*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
-- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.03.26** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.04.02*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.04.02** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.03.26 +[debug] youtube-dl version 2017.04.02 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 3ffc647f1..0199bdf1f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2017.04.02 Core [YoutubeDL] Return early when extraction of url_transparent fails diff --git a/README.md b/README.md index 86b44781c..41f647aaa 100644 --- a/README.md +++ b/README.md @@ -181,10 +181,10 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo -R, --retries RETRIES Number of retries (default is 10), or "infinite". 
--fragment-retries RETRIES Number of retries for a fragment (default - is 10), or "infinite" (DASH and hlsnative - only) - --skip-unavailable-fragments Skip unavailable fragments (DASH and - hlsnative only) + is 10), or "infinite" (DASH, hlsnative and + ISM) + --skip-unavailable-fragments Skip unavailable fragments (DASH, hlsnative + and ISM) --abort-on-unavailable-fragment Abort downloading when some fragment is not available --buffer-size SIZE Size of download buffer (e.g. 1024 or 16K) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index e9dbc021b..5c1855111 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -28,6 +28,7 @@ - **acast** - **acast:channel** - **AddAnime** + - **ADN**: Anime Digital Network - **AdobeTV** - **AdobeTVChannel** - **AdobeTVShow** @@ -572,6 +573,8 @@ - **orf:iptv**: iptv.ORF.at - **orf:oe1**: Radio Österreich 1 - **orf:tvthek**: ORF TVthek + - **PacktPub** + - **PacktPubCourse** - **PandaTV**: 熊猫TV - **pandora.tv**: 판도라TV - **parliamentlive.tv**: UK parliament videos @@ -629,7 +632,7 @@ - **radiofrance** - **RadioJavan** - **Rai** - - **RaiTV** + - **RaiPlay** - **RBMARadio** - **RDS**: RDS.ca - **RedBullTV** @@ -926,6 +929,8 @@ - **vpro**: npo.nl and ntr.nl - **Vrak** - **VRT** + - **vrv** + - **vrv:series** - **vube**: Vube.com - **VuClip** - **VVVVID** @@ -953,7 +958,7 @@ - **WSJ**: Wall Street Journal - **XBef** - **XboxClips** - - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To, XVIDSTAGE + - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To, XVIDSTAGE, Vid ABC - **XHamster** - **XHamsterEmbed** - **xiami:album**: 虾米音乐 - 专辑 diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 94e8198ec..f612d03ca 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals 
-__version__ = '2017.03.26' +__version__ = '2017.04.02' From b3633fa0ce0f98801582f8e4e348436b0f361eb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Apr 2017 03:20:28 +0700 Subject: [PATCH 100/200] [pericope] Add support for pscp.tv URLs --- youtube_dl/extractor/periscope.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 0e3623024..1add6b840 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -20,7 +20,7 @@ class PeriscopeBaseIE(InfoExtractor): class PeriscopeIE(PeriscopeBaseIE): IE_DESC = 'Periscope' IE_NAME = 'periscope' - _VALID_URL = r'https?://(?:www\.)?periscope\.tv/[^/]+/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/[^/]+/(?P<id>[^/?#]+)' # Alive example URLs can be found here http://onperiscope.com/ _TESTS = [{ 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', @@ -41,6 +41,9 @@ class PeriscopeIE(PeriscopeBaseIE): }, { 'url': 'https://www.periscope.tv/bastaakanoggano/1OdKrlkZZjOJX', 'only_matching': True, + }, { + 'url': 'https://www.periscope.tv/w/1ZkKzPbMVggJv', + 'only_matching': True, }] @staticmethod @@ -103,7 +106,7 @@ class PeriscopeIE(PeriscopeBaseIE): class PeriscopeUserIE(PeriscopeBaseIE): - _VALID_URL = r'https?://(?:www\.)?periscope\.tv/(?P<id>[^/]+)/?$' + _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/(?P<id>[^/]+)/?$' IE_DESC = 'Periscope user videos' IE_NAME = 'periscope:user' From 4457823dda410c5406f5ab5474b9b1f9325fa7ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Apr 2017 03:56:49 +0700 Subject: [PATCH 101/200] [extractor/common] Move censorship checks to a separate method and add check for just another ISP --- youtube_dl/extractor/common.py | 48 ++++++++++++++++++++-------------- 1 file changed, 29 
insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 6c3c095f7..cdfa7000b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -547,6 +547,34 @@ class InfoExtractor(object): return encoding + def __check_blocked(self, content): + first_block = content[:512] + if ('<title>Access to this site is blocked' in content and + 'Websense' in first_block): + msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' + blocked_iframe = self._html_search_regex( + r'