From 03486dbb0133e42074c272f60e24f18c856fdf0d Mon Sep 17 00:00:00 2001 From: Random User Date: Sat, 25 Mar 2017 19:37:45 +0100 Subject: [PATCH 001/611] Add test for JWPlayer where config is passed as variable --- youtube_dl/extractor/generic.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 9868ca6d0..c8c103ae3 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -972,6 +972,20 @@ class GenericIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', }, }, + { + # JWPlayer config passed as variable + 'url': 'http://www.txxx.com/videos/3326530/ariele/', + 'info_dict': { + 'id': '3326530_hq', + 'ext': 'mp4', + 'title': 'ARIELE | Tube Cup', + 'uploader': 'www.txxx.com', + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + } + }, # rtl.nl embed { 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen', From c73e330e7adc9c0c15ac51aeea8fbb7dad95351a Mon Sep 17 00:00:00 2001 From: Random User Date: Sat, 25 Mar 2017 19:38:30 +0100 Subject: [PATCH 002/611] _find_jwplayer_data() returns dict or None This simplifies code for callers of `_find_jwplayer_data()` which no longer have to run `_parse_json()` on the return value. It also makes sure that `_find_jwplayer_data()` returns either a `dict` or `None` and nothing else. --- youtube_dl/extractor/common.py | 18 ++++++++++++------ youtube_dl/extractor/generic.py | 12 ++++-------- youtube_dl/extractor/tvnoe.py | 5 ++--- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index eb3c091aa..c2ca73ee1 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2161,18 +2161,24 @@ class InfoExtractor(object): }) return formats - @staticmethod - def _find_jwplayer_data(webpage): + def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json): mobj = re.search( r'jwplayer\((?P[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P[^)]+)\)', webpage) if mobj: - return mobj.group('options') + try: + jwplayer_data = self._parse_json(mobj.group('options'), + video_id=video_id, + transform_source=transform_source) + except ExtractorError: + pass + else: + if isinstance(jwplayer_data, dict): + return jwplayer_data def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs): - jwplayer_data = self._parse_json( - self._find_jwplayer_data(webpage), video_id, - transform_source=js_to_json) + jwplayer_data = self._find_jwplayer_data( + webpage, video_id, transform_source=js_to_json) return self._parse_jwplayer_data( jwplayer_data, video_id, *args, **kwargs) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c8c103ae3..3fe0237b6 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2518,14 +2518,10 @@ class GenericIE(InfoExtractor): self._sort_formats(entry['formats']) return self.playlist_result(entries) - jwplayer_data_str = self._find_jwplayer_data(webpage) - if jwplayer_data_str: - try: - jwplayer_data = self._parse_json( - jwplayer_data_str, video_id, transform_source=js_to_json) - return self._parse_jwplayer_data(jwplayer_data, video_id) - except ExtractorError: - pass + jwplayer_data = self._find_jwplayer_data( + webpage, video_id, transform_source=js_to_json) + if jwplayer_data: + return self._parse_jwplayer_data(jwplayer_data, video_id) def check_video(vurl): if YoutubeIE.suitable(vurl): diff --git a/youtube_dl/extractor/tvnoe.py b/youtube_dl/extractor/tvnoe.py index 1a5b76bf2..26a5aeae4 100644 --- a/youtube_dl/extractor/tvnoe.py +++ b/youtube_dl/extractor/tvnoe.py @@ -31,9 +31,8 @@ class TVNoeIE(InfoExtractor): r']+src="([^"]+)"', webpage, 'iframe URL') ifs_page = self._download_webpage(iframe_url, video_id) - jwplayer_data = self._parse_json( - self._find_jwplayer_data(ifs_page), - video_id, transform_source=js_to_json) + jwplayer_data = self._find_jwplayer_data( + ifs_page, video_id, transform_source=js_to_json) info_dict = self._parse_jwplayer_data( jwplayer_data, video_id, require_title=False, base_url=iframe_url) From f7923a4c399e0ce8e6cd230db92aefbfcff297c3 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 26 Mar 2017 22:07:12 +0800 Subject: [PATCH 003/611] [ChangeLog] Update after #12307 --- ChangeLog | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ChangeLog b/ChangeLog index 45d6f244d..adc64053b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,10 @@ version +Core +* Don't raise an error if JWPlayer config data is not a Javascript object + literal. _find_jwplayer_data() now returns a dict rather than an str. + (#12307) + Extractors * [afreecatv] Fix extraction (#12179) From 82eefd0be00b7557782ae75602b463e226dd964f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 26 Mar 2017 23:39:12 +0700 Subject: [PATCH 004/611] [ChangeLog] Actualize --- ChangeLog | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index adc64053b..e79067cff 100644 --- a/ChangeLog +++ b/ChangeLog @@ -2,11 +2,21 @@ version Core * Don't raise an error if JWPlayer config data is not a Javascript object - literal. _find_jwplayer_data() now returns a dict rather than an str. - (#12307) + literal. _find_jwplayer_data now returns a dict rather than an str. (#12307) +* Expand environment variables for options representing paths (#12556) ++ [utils] Introduce expand_path +* [downloader/hls] Delegate downloading to ffmpeg immediately for live streams Extractors * [afreecatv] Fix extraction (#12179) ++ [atvat] Add support for atv.at (#5325) ++ [fox] Add metadata extraction (#12391) ++ [atresplayer] Extract DASH formats ++ [atresplayer] Extract HD manifest (#12548) +* [atresplayer] Fix login error detection (#12548) +* [franceculture] Fix extraction (#12547) +* [youtube] Improve URL regular expression (#12538) +* [generic] Do not follow redirects to the same URL version 2017.03.24 From 9e691da06791a0a617ed69ef21e272536e247ed1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 26 Mar 2017 08:11:40 +0700 Subject: [PATCH 005/611] release 2017.03.26 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index dfff41d2d..2f717926c 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.03.24*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.03.24** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.03.26*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.03.26** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.03.24 +[debug] youtube-dl version 2017.03.26 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index e79067cff..07725b12a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2017.03.26 Core * Don't raise an error if JWPlayer config data is not a Javascript object diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 7c99ba3c2..e9dbc021b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -67,6 +67,7 @@ - **arte.tv:playlist** - **AtresPlayer** - **ATTTechChannel** + - **ATVAt** - **AudiMedia** - **AudioBoom** - **audiomack** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 13904c724..94e8198ec 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.03.24' +__version__ = '2017.03.26' From aea1dccbd07b073ef36b325a9a21eb3f642322d9 Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Tue, 28 Mar 2017 15:42:03 +0200 Subject: [PATCH 006/611] [openload] fix extractor --- youtube_dl/extractor/openload.py | 73 +++++++++++++------------------- 1 file changed, 30 insertions(+), 43 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 58ffde541..d8036b54a 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -75,51 +75,38 @@ class OpenloadIE(InfoExtractor): ']+id="[^"]+"[^>]*>([0-9A-Za-z]+)', webpage, 'openload ID') - video_url_chars = [] - - first_char = ord(ol_id[0]) - key = first_char - 55 - maxKey = max(2, key) - key = min(maxKey, len(ol_id) - 38) - t = ol_id[key:key + 36] - - hashMap = {} - v = ol_id.replace(t, '') - h = 0 - - while h < len(t): - f = t[h:h + 3] - i = int(f, 8) - hashMap[h / 3] = i - h += 3 - - h = 0 - H = 0 - while h < len(v): - B = '' - C = '' - if len(v) >= h + 2: - B = v[h:h + 2] - if len(v) >= h + 3: - C = v[h:h + 3] - i = int(B, 16) - h += 2 - if H % 3 == 0: - i = int(C, 8) - h += 1 - elif H % 2 == 0 and H != 0 and ord(v[H - 1]) < 60: - i = int(C, 10) - h += 1 - index = H % 7 - - A = hashMap[index] - i ^= 213 - i ^= A - video_url_chars.append(compat_chr(i)) - H += 1 + decoded = '' + a = ol_id[0:24] + b = [] + for i in range(0, len(a), 8): + b.append(int(a[i:i + 8] or '0', 16)) + ol_id = ol_id[24:] + j = 0 + k = 0 + while j < len(ol_id): + c = 128 + d = 0 + e = 0 + f = 0 + _more = True + while _more: + if j + 1 >= len(ol_id): + c = 143 + f = int(ol_id[j:j + 2] or '0', 16) + j += 2 + d += (f & 127) << e + e += 7 + _more = f >= c + g = d ^ b[k % 3] + for i in range(4): + char_dec = (g >> 8 * i) & (c + 127) + char = compat_chr(char_dec) + if char != '#': + decoded += char + k += 1 video_url = 'https://openload.co/stream/%s?mime=true' - video_url = video_url % (''.join(video_url_chars)) + video_url = video_url % decoded title = self._og_search_title(webpage, default=None) or self._search_regex( r']+class=["\']title["\'][^>]*>([^<]+)', webpage, From 12ee65ea0d09c6ac42ad06b3d561b4a26db00cfe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 28 Mar 2017 23:35:48 +0700 Subject: [PATCH 007/611] [options] Mention ISM for --fragment-retries and --skip-unavailable-fragments --- youtube_dl/options.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 6b811535f..2d2f5e47b 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -459,11 +459,11 @@ def parseOpts(overrideArguments=None): downloader.add_option( '--fragment-retries', dest='fragment_retries', metavar='RETRIES', default=10, - help='Number of retries for a fragment (default is %default), or "infinite" (DASH and hlsnative only)') + help='Number of retries for a fragment (default is %default), or "infinite" (DASH, hlsnative and ISM)') downloader.add_option( '--skip-unavailable-fragments', action='store_true', dest='skip_unavailable_fragments', default=True, - help='Skip unavailable fragments (DASH and hlsnative only)') + help='Skip unavailable fragments (DASH, hlsnative and ISM)') downloader.add_option( '--abort-on-unavailable-fragment', action='store_false', dest='skip_unavailable_fragments', From 128244657b92582f7f4793c2d1be86b04032ac7f Mon Sep 17 00:00:00 2001 From: plroman Date: Tue, 28 Mar 2017 23:23:20 +0200 Subject: [PATCH 008/611] [allocine] Fix extraction --- youtube_dl/extractor/allocine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/allocine.py b/youtube_dl/extractor/allocine.py index 90f11d39f..0463a070b 100644 --- a/youtube_dl/extractor/allocine.py +++ b/youtube_dl/extractor/allocine.py @@ -70,7 +70,7 @@ class AllocineIE(InfoExtractor): if model: model_data = self._parse_json(model, display_id) - for video_url in model_data['sources'].values(): + for video_url in model_data['videos'][0]['sources'].values(): video_id, format_id = url_basename(video_url).split('_')[:2] formats.append({ 'format_id': format_id, @@ -78,7 +78,7 @@ class AllocineIE(InfoExtractor): 'url': video_url, }) - title = model_data['title'] + title = model_data['videos'][0]['title'] else: video_id = display_id media_data = self._download_json( From 639e5b2a848c0a73e8525472dd8bb4b14a8c4746 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 29 Mar 2017 04:43:12 +0700 Subject: [PATCH 009/611] [allocine] Extract more metadata --- youtube_dl/extractor/allocine.py | 46 +++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/allocine.py b/youtube_dl/extractor/allocine.py index 0463a070b..cd533acfc 100644 --- a/youtube_dl/extractor/allocine.py +++ b/youtube_dl/extractor/allocine.py @@ -2,9 +2,13 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - remove_end, + int_or_none, qualities, + remove_end, + try_get, + unified_timestamp, url_basename, ) @@ -22,6 +26,10 @@ class AllocineIE(InfoExtractor): 'title': 'Astérix - Le Domaine des Dieux Teaser VF', 'description': 'md5:4a754271d9c6f16c72629a8a993ee884', 'thumbnail': r're:http://.*\.jpg', + 'duration': 39, + 'timestamp': 1404273600, + 'upload_date': '20140702', + 'view_count': int, }, }, { 'url': 'http://www.allocine.fr/video/player_gen_cmedia=19540403&cfilm=222257.html', @@ -33,6 +41,10 @@ class AllocineIE(InfoExtractor): 'title': 'Planes 2 Bande-annonce VF', 'description': 'Regardez la bande annonce du film Planes 2 (Planes 2 Bande-annonce VF). Planes 2, un film de Roberts Gannaway', 'thumbnail': r're:http://.*\.jpg', + 'duration': 69, + 'timestamp': 1385659800, + 'upload_date': '20131128', + 'view_count': int, }, }, { 'url': 'http://www.allocine.fr/video/player_gen_cmedia=19544709&cfilm=181290.html', @@ -44,6 +56,10 @@ class AllocineIE(InfoExtractor): 'title': 'Dragons 2 - Bande annonce finale VF', 'description': 'md5:6cdd2d7c2687d4c6aafe80a35e17267a', 'thumbnail': r're:http://.*\.jpg', + 'duration': 144, + 'timestamp': 1397589900, + 'upload_date': '20140415', + 'view_count': int, }, }, { 'url': 'http://www.allocine.fr/video/video-19550147/', @@ -69,34 +85,37 @@ class AllocineIE(InfoExtractor): r'data-model="([^"]+)"', webpage, 'data model', default=None) if model: model_data = self._parse_json(model, display_id) - - for video_url in model_data['videos'][0]['sources'].values(): + video = model_data['videos'][0] + title = video['title'] + for video_url in video['sources'].values(): video_id, format_id = url_basename(video_url).split('_')[:2] formats.append({ 'format_id': format_id, 'quality': quality(format_id), 'url': video_url, }) - - title = model_data['videos'][0]['title'] + duration = int_or_none(video.get('duration')) + view_count = int_or_none(video.get('view_count')) + timestamp = unified_timestamp(try_get( + video, lambda x: x['added_at']['date'], compat_str)) else: video_id = display_id media_data = self._download_json( 'http://www.allocine.fr/ws/AcVisiondataV5.ashx?media=%s' % video_id, display_id) + title = remove_end( + self._html_search_regex( + r'(?s)(.+?)', webpage, 'title').strip(), + ' - AlloCiné') for key, value in media_data['video'].items(): if not key.endswith('Path'): continue - format_id = key[:-len('Path')] formats.append({ 'format_id': format_id, 'quality': quality(format_id), 'url': value, }) - - title = remove_end(self._html_search_regex( - r'(?s)(.+?)', webpage, 'title' - ).strip(), ' - AlloCiné') + duration, view_count, timestamp = [None] * 3 self._sort_formats(formats) @@ -104,7 +123,10 @@ class AllocineIE(InfoExtractor): 'id': video_id, 'display_id': display_id, 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage), - 'formats': formats, 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'duration': duration, + 'timestamp': timestamp, + 'view_count': view_count, + 'formats': formats, } From 82be732b174ea8e9984e7b0582c69e41b266d1da Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 31 Mar 2017 12:24:23 +0100 Subject: [PATCH 010/611] [adn] Add new extractor --- youtube_dl/extractor/adn.py | 136 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 137 insertions(+) create mode 100644 youtube_dl/extractor/adn.py diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py new file mode 100644 index 000000000..e44caa00b --- /dev/null +++ b/youtube_dl/extractor/adn.py @@ -0,0 +1,136 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import json +import os + +from .common import InfoExtractor +from ..aes import aes_cbc_decrypt +from ..compat import compat_ord +from ..utils import ( + bytes_to_intlist, + ExtractorError, + float_or_none, + intlist_to_bytes, + srt_subtitles_timecode, + strip_or_none, +) + + +class ADNIE(InfoExtractor): + IE_DESC = 'Anime Digital Network' + _VALID_URL = r'https?://(?:www\.)?animedigitalnetwork\.fr/video/[^/]+/(?P\d+)' + _TEST = { + 'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites', + 'md5': 'e497370d847fd79d9d4c74be55575c7a', + 'info_dict': { + 'id': '7778', + 'ext': 'mp4', + 'title': 'Blue Exorcist - Kyôto Saga - Épisode 1', + 'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5', + } + } + + def _get_subtitles(self, sub_path, video_id): + if not sub_path: + return None + + enc_subtitles = self._download_webpage( + 'http://animedigitalnetwork.fr/' + sub_path, + video_id, fatal=False) + if not enc_subtitles: + return None + + # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js + dec_subtitles = intlist_to_bytes(aes_cbc_decrypt( + bytes_to_intlist(base64.b64decode(enc_subtitles[24:])), + bytes_to_intlist(b'\xb5@\xcfq\xa3\x98"N\xe4\xf3\x12\x98}}\x16\xd8'), + bytes_to_intlist(base64.b64decode(enc_subtitles[:24])) + )) + subtitles_json = self._parse_json( + dec_subtitles[:-compat_ord(dec_subtitles[-1])], + None, fatal=False) + if not subtitles_json: + return None + + subtitles = {} + for sub_lang, sub in subtitles_json.items(): + srt = '' + for num, current in enumerate(sub): + start, end, text = ( + float_or_none(current.get('startTime')), + float_or_none(current.get('endTime')), + current.get('text')) + if start is None or end is None or text is None: + continue + srt += os.linesep.join( + ( + '%d' % num, + '%s --> %s' % ( + srt_subtitles_timecode(start), + srt_subtitles_timecode(end)), + text, + os.linesep, + )) + + if sub_lang == 'vostf': + sub_lang = 'fr' + subtitles.setdefault(sub_lang, []).extend([{ + 'ext': 'json', + 'data': json.dumps(sub), + }, { + 'ext': 'srt', + 'data': srt, + }]) + return subtitles + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + player_config = self._parse_json(self._search_regex( + r'playerConfig\s*=\s*({.+});', webpage, 'player config'), video_id) + + video_info = {} + video_info_str = self._search_regex( + r'videoInfo\s*=\s*({.+});', webpage, + 'video info', fatal=False) + if video_info_str: + video_info = self._parse_json( + video_info_str, video_id, fatal=False) or {} + + options = player_config.get('options') or {} + metas = options.get('metas') or {} + title = metas.get('title') or video_info['title'] + links = player_config.get('links') or {} + + formats = [] + for format_id, qualities in links.items(): + for load_balancer_url in qualities.values(): + load_balancer_data = self._download_json( + load_balancer_url, video_id, fatal=False) or {} + m3u8_url = load_balancer_data.get('location') + if not m3u8_url: + continue + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False) + if format_id == 'vf': + for f in m3u8_formats: + f['language'] = 'fr' + formats.extend(m3u8_formats) + error = options.get('error') + if not formats and error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': strip_or_none(metas.get('summary') or video_info.get('resume')), + 'thumbnail': video_info.get('image'), + 'formats': formats, + 'subtitles': self.extract_subtitles(player_config.get('subtitles'), video_id), + 'episode': metas.get('subtitle') or video_info.get('videoTitle'), + 'series': video_info.get('playlistTitle'), + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6a7028a4d..43933ad5b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -19,6 +19,7 @@ from .acast import ( ACastChannelIE, ) from .addanime import AddAnimeIE +from .adn import ADNIE from .adobetv import ( AdobeTVIE, AdobeTVShowIE, From 3e943cfe09eda6ef9b0fa419fdd22155fbaa047f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 31 Mar 2017 14:54:06 +0100 Subject: [PATCH 011/611] [generic] pass base_url to _parse_jwplayer_data --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 274f81738..73911940c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2568,7 +2568,7 @@ class GenericIE(InfoExtractor): webpage, video_id, transform_source=js_to_json) if jwplayer_data: info = self._parse_jwplayer_data( - jwplayer_data, video_id, require_title=False) + jwplayer_data, video_id, require_title=False, base_url=url) if not info.get('title'): info['title'] = video_title return info From 1640eb096166c81918125a0a7462eb2edb063167 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 31 Mar 2017 23:57:35 +0700 Subject: [PATCH 012/611] [YoutubeDL] Return early when extraction of url_transparent fails --- youtube_dl/YoutubeDL.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 21586f0f4..54bc8b06d 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -837,6 +837,12 @@ class YoutubeDL(object): ie_result['url'], ie_key=ie_result.get('ie_key'), extra_info=extra_info, download=False, process=False) + # extract_info may return None when ignoreerrors is enabled and + # extraction failed with an error, don't crash and return early + # in this case + if not info: + return info + force_properties = dict( (k, v) for k, v in ie_result.items() if v is not None) for f in ('_type', 'url', 'ie_key'): From 7453999580f2809153a84420d3ca72b24186c02b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 Apr 2017 00:25:27 +0700 Subject: [PATCH 013/611] [packtpub] Add extractor (closes #12610) --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/packtpub.py | 138 +++++++++++++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100644 youtube_dl/extractor/packtpub.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 43933ad5b..6ad7444fe 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -729,6 +729,10 @@ from .orf import ( ORFFM4IE, ORFIPTVIE, ) +from .packtpub import ( + PacktPubIE, + PacktPubCourseIE, +) from .pandatv import PandaTVIE from .pandoratv import PandoraTVIE from .parliamentliveuk import ParliamentLiveUKIE diff --git a/youtube_dl/extractor/packtpub.py b/youtube_dl/extractor/packtpub.py new file mode 100644 index 000000000..881f3bcc7 --- /dev/null +++ b/youtube_dl/extractor/packtpub.py @@ -0,0 +1,138 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + clean_html, + ExtractorError, + remove_end, + strip_or_none, + unified_timestamp, + urljoin, +) + + +class PacktPubBaseIE(InfoExtractor): + _PACKT_BASE = 'https://www.packtpub.com' + _MAPT_REST = '%s/mapt-rest' % _PACKT_BASE + + +class PacktPubIE(PacktPubBaseIE): + _VALID_URL = r'https?://(?:www\.)?packtpub\.com/mapt/video/[^/]+/(?P\d+)/(?P\d+)/(?P\d+)' + + _TEST = { + 'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215/20528/20530/Project+Intro', + 'md5': '1e74bd6cfd45d7d07666f4684ef58f70', + 'info_dict': { + 'id': '20530', + 'ext': 'mp4', + 'title': 'Project Intro', + 'thumbnail': r're:(?i)^https?://.*\.jpg', + 'timestamp': 1490918400, + 'upload_date': '20170331', + }, + } + + def _handle_error(self, response): + if response.get('status') != 'success': + raise ExtractorError( + '% said: %s' % (self.IE_NAME, response['message']), + expected=True) + + def _download_json(self, *args, **kwargs): + response = super(PacktPubIE, self)._download_json(*args, **kwargs) + self._handle_error(response) + return response + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + course_id, chapter_id, video_id = mobj.group( + 'course_id', 'chapter_id', 'id') + + video = self._download_json( + '%s/users/me/products/%s/chapters/%s/sections/%s' + % (self._MAPT_REST, course_id, chapter_id, video_id), video_id, + 'Downloading JSON video')['data'] + + content = video.get('content') + if not content: + raise ExtractorError('This video is locked', expected=True) + + video_url = content['file'] + + metadata = self._download_json( + '%s/products/%s/chapters/%s/sections/%s/metadata' + % (self._MAPT_REST, course_id, chapter_id, video_id), + video_id)['data'] + + title = metadata['pageTitle'] + course_title = metadata.get('title') + if course_title: + title = remove_end(title, ' - %s' % course_title) + timestamp = unified_timestamp(metadata.get('publicationDate')) + thumbnail = urljoin(self._PACKT_BASE, metadata.get('filepath')) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + } + + +class PacktPubCourseIE(PacktPubBaseIE): + _VALID_URL = r'(?Phttps?://(?:www\.)?packtpub\.com/mapt/video/[^/]+/(?P\d+))' + _TEST = { + 'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215', + 'info_dict': { + 'id': '9781787122215', + 'title': 'Learn Nodejs by building 12 projects [Video]', + }, + 'playlist_count': 90, + } + + @classmethod + def suitable(cls, url): + return False if PacktPubIE.suitable(url) else super( + PacktPubCourseIE, cls).suitable(url) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + url, course_id = mobj.group('url', 'id') + + course = self._download_json( + '%s/products/%s/metadata' % (self._MAPT_REST, course_id), + course_id)['data'] + + entries = [] + for chapter_num, chapter in enumerate(course['tableOfContents'], 1): + if chapter.get('type') != 'chapter': + continue + children = chapter.get('children') + if not isinstance(children, list): + continue + chapter_info = { + 'chapter': chapter.get('title'), + 'chapter_number': chapter_num, + 'chapter_id': chapter.get('id'), + } + for section in children: + if section.get('type') != 'section': + continue + section_url = section.get('seoUrl') + if not isinstance(section_url, compat_str): + continue + entry = { + '_type': 'url_transparent', + 'url': urljoin(url + '/', section_url), + 'title': strip_or_none(section.get('title')), + 'description': clean_html(section.get('summary')), + 'ie_key': PacktPubIE.ie_key(), + } + entry.update(chapter_info) + entries.append(entry) + + return self.playlist_result(entries, course_id, course.get('title')) From 77c8ebe6318055cc34eaedca63f4866c4c47437a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 31 Mar 2017 23:28:24 +0100 Subject: [PATCH 014/611] [vrv] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/vrv.py | 151 +++++++++++++++++++++++++++++ 2 files changed, 152 insertions(+) create mode 100644 youtube_dl/extractor/vrv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6ad7444fe..1b427e256 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1182,6 +1182,7 @@ from .voxmedia import VoxMediaIE from .vporn import VpornIE from .vrt import VRTIE from .vrak import VrakIE +from .vrv import VRVIE from .medialaan import MedialaanIE from .vube import VubeIE from .vuclip import VuClipIE diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py new file mode 100644 index 000000000..33618c951 --- /dev/null +++ b/youtube_dl/extractor/vrv.py @@ -0,0 +1,151 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import json +import hashlib +import hmac +import random +import string +import time + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlencode, + compat_urlparse, +) +from ..utils import ( + float_or_none, + int_or_none, +) + + +class VRVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P[A-Z0-9]+)' + _TEST = { + 'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT', + 'info_dict': { + 'id': 'GR9PNZ396', + 'ext': 'mp4', + 'title': 'BOSTON: WHERE THE PAST IS THE PRESENT', + 'description': 'md5:4ec8844ac262ca2df9e67c0983c6b83f', + 'uploader_id': 'seeso', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + _API_DOMAIN = None + _API_PARAMS = {} + _CMS_SIGNING = {} + + def _call_api(self, path, video_id, note, data=None): + base_url = self._API_DOMAIN + '/core/' + path + encoded_query = compat_urllib_parse_urlencode({ + 'oauth_consumer_key': self._API_PARAMS['oAuthKey'], + 'oauth_nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]), + 'oauth_signature_method': 'HMAC-SHA1', + 'oauth_timestamp': int(time.time()), + 'oauth_version': '1.0', + }) + headers = self.geo_verification_headers() + if data: + data = json.dumps(data).encode() + headers['Content-Type'] = 'application/json' + method = 'POST' if data else 'GET' + base_string = '&'.join([method, compat_urlparse.quote(base_url, ''), compat_urlparse.quote(encoded_query, '')]) + oauth_signature = base64.b64encode(hmac.new( + (self._API_PARAMS['oAuthSecret'] + '&').encode('ascii'), + base_string.encode(), hashlib.sha1).digest()).decode() + encoded_query += '&oauth_signature=' + compat_urlparse.quote(oauth_signature, '') + return self._download_json( + '?'.join([base_url, encoded_query]), video_id, + note='Downloading %s JSON metadata' % note, headers=headers, data=data) + + def _call_cms(self, path, video_id, note): + return self._download_json( + self._API_DOMAIN + path, video_id, query=self._CMS_SIGNING, + note='Downloading %s JSON metadata' % note, headers=self.geo_verification_headers()) + + def _set_api_params(self, webpage, video_id): + if not self._API_PARAMS: + self._API_PARAMS = self._parse_json(self._search_regex( + r'window\.__APP_CONFIG__\s*=\s*({.+?})', + webpage, 'api config'), video_id)['cxApiParams'] + self._API_DOMAIN = self._API_PARAMS.get('apiDomain', 'https://api.vrv.co') + + def _set_cms_signing(self, video_id): + if not self._CMS_SIGNING: + self._CMS_SIGNING = self._call_api('index', video_id, 'CMS Signing')['cms_signing'] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + url, video_id, + headers=self.geo_verification_headers()) + media_resource = self._parse_json(self._search_regex( + r'window\.__INITIAL_STATE__\s*=\s*({.+?})', + webpage, 'inital state'), video_id).get('watch', {}).get('mediaResource') or {} + + video_data = media_resource.get('json') + if not video_data: + self._set_api_params(webpage, video_id) + episode_path = self._call_api('cms_resource', video_id, 'episode resource path', data={ + 'resource_key': 'cms:/episodes/' + video_id, + })['__links__']['cms_resource']['href'] + self._set_cms_signing(video_id) + video_data = self._call_cms(episode_path, video_id, 'video') + title = video_data['title'] + + streams_json = media_resource.get('streams', {}).get('json', {}) + if not streams_json: + self._set_api_params(webpage, video_id) + streams_path = video_data['__links__']['streams']['href'] + self._set_cms_signing(video_id) + streams_json = self._call_cms(streams_path, video_id, 'streams') + + audio_locale = streams_json.get('audio_locale') + formats = [] + for stream_id, stream in streams_json.get('streams', {}).get('adaptive_hls', {}).items(): + stream_url = stream.get('url') + if not stream_url: + continue + stream_id = stream_id or audio_locale + m3u8_formats = self._extract_m3u8_formats( + stream_url, video_id, 'mp4', m3u8_id=stream_id, + note='Downloading %s m3u8 information' % stream_id, + fatal=False) + if audio_locale: + for f in m3u8_formats: + f['language'] = audio_locale + formats.extend(m3u8_formats) + self._sort_formats(formats) + + thumbnails = [] + for thumbnail in video_data.get('images', {}).get('thumbnails', []): + thumbnail_url = thumbnail.get('source') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': video_data.get('description'), + 'duration': float_or_none(video_data.get('duration_ms'), 1000), + 'uploader_id': video_data.get('channel_id'), + 'series': video_data.get('series_title'), + 'season': video_data.get('season_title'), + 'season_number': int_or_none(video_data.get('season_number')), + 'season_id': video_data.get('season_id'), + 'episode': title, + 'episode_number': int_or_none(video_data.get('episode_number')), + 'episode_id': video_data.get('production_episode_id'), + } From be61efdf1754d026f270f6d87446040231d56954 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 1 Apr 2017 07:26:40 +0100 Subject: [PATCH 015/611] [tvplay] Bypass geo restriction --- youtube_dl/extractor/tvplay.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 3eda0a399..99ff82a5d 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -225,7 +225,11 @@ class TVPlayIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - + geo_country = self._search_regex( + r'https?://[^/]+\.([a-z]{2})', url, + 'geo country', default=None) + if geo_country: + self._initialize_geo_bypass([geo_country.upper()]) video = self._download_json( 'http://playapi.mtgx.tv/v3/videos/%s' % video_id, video_id, 'Downloading video JSON') From e97fc8d6b837921ea8429727f026238b857e1b31 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 1 Apr 2017 07:50:24 +0100 Subject: [PATCH 016/611] [cwtv] extract ISM formats --- youtube_dl/extractor/cwtv.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py index 1ab9333b2..f4cf0f1c5 100644 --- a/youtube_dl/extractor/cwtv.py +++ b/youtube_dl/extractor/cwtv.py @@ -82,6 +82,11 @@ class CWTVIE(InfoExtractor): 'url': quality_url, 'tbr': tbr, }) + video_metadata = video_data['assetFields'] + ism_url = video_metadata.get('smoothStreamingUrl') + if ism_url: + formats.extend(self._extract_ism_formats( + ism_url, video_id, ism_id='mss', fatal=False)) self._sort_formats(formats) thumbnails = [{ @@ -90,8 +95,6 @@ class CWTVIE(InfoExtractor): 'height': image.get('height'), } for image_id, image in video_data['images'].items() if image.get('uri')] if video_data.get('images') else None - video_metadata = video_data['assetFields'] - subtitles = { 'en': [{ 'url': video_metadata['UnicornCcUrl'], From ca77b92f94010bdf2d44de44cb23e32075b7dcaa Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 1 Apr 2017 09:33:23 +0100 Subject: [PATCH 017/611] [crunchyroll] pass geo verifcation proxy --- youtube_dl/extractor/crunchyroll.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index d15fd3744..2ed8b30bb 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -390,7 +390,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text else: webpage_url = 'http://www.' + mobj.group('url') - webpage = self._download_webpage(self._add_skip_wall(webpage_url), video_id, 'Downloading webpage') + webpage = self._download_webpage( + self._add_skip_wall(webpage_url), video_id, + headers=self.geo_verification_headers()) note_m = self._html_search_regex( r'
(.+?)
', webpage, 'trailer-notice', default='') @@ -565,7 +567,9 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): def _real_extract(self, url): show_id = self._match_id(url) - webpage = self._download_webpage(self._add_skip_wall(url), show_id) + webpage = self._download_webpage( + self._add_skip_wall(url), show_id, + headers=self.geo_verification_headers()) title = self._html_search_regex( r'(?s)]*>\s*(.*?)', webpage, 'title') From 2cd668ee591df4f271ed4394ba9b38262ae3c40e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 Apr 2017 18:55:48 +0700 Subject: [PATCH 018/611] [xfileshare] Improve extraction and extract hls formats --- youtube_dl/extractor/xfileshare.py | 57 +++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index e616adce3..6de5b26d7 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( decode_packed_codes, + determine_ext, ExtractorError, int_or_none, NO_DEFAULT, @@ -95,6 +96,16 @@ class XFileShareIE(InfoExtractor): # removed by administrator 'url': 'http://xvidstage.com/amfy7atlkx25', 'only_matching': True, + }, { + 'url': 'http://vidabc.com/i8ybqscrphfv', + 'info_dict': { + 'id': 'i8ybqscrphfv', + 'ext': 'mp4', + 'title': 're:Beauty and the Beast 2017', + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -133,31 +144,45 @@ class XFileShareIE(InfoExtractor): webpage, 'title', default=None) or self._og_search_title( webpage, default=None) or video_id).strip() - def extract_video_url(default=NO_DEFAULT): - return self._search_regex( - (r'file\s*:\s*(["\'])(?Phttp.+?)\1,', - r'file_link\s*=\s*(["\'])(?Phttp.+?)\1', - r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?Phttp.+?)\2\)', - r']+src=(["\'])(?Phttp.+?)\1'), - webpage, 'file url', default=default, group='url') + def extract_formats(default=NO_DEFAULT): + urls = [] + for regex in ( + r'file\s*:\s*(["\'])(?Phttp(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1', + r'file_link\s*=\s*(["\'])(?Phttp(?:(?!\1).)+)\1', + r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?Phttp(?:(?!\2).)+)\2\)', + r']+src=(["\'])(?Phttp(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1'): + for mobj in re.finditer(regex, webpage): + video_url = mobj.group('url') + if video_url not in urls: + urls.append(video_url) + formats = [] + for video_url in urls: + if determine_ext(video_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + else: + formats.append({ + 'url': video_url, + 'format_id': 'sd', + }) + if not formats and default is not NO_DEFAULT: + return default + self._sort_formats(formats) + return formats - video_url = extract_video_url(default=None) + formats = extract_formats(default=None) - if not video_url: + if not formats: webpage = decode_packed_codes(self._search_regex( r"(}\('(.+)',(\d+),(\d+),'[^']*\b(?:file|embed)\b[^']*'\.split\('\|'\))", webpage, 'packed code')) - video_url = extract_video_url() + formats = extract_formats() thumbnail = self._search_regex( r'image\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'thumbnail', default=None) - formats = [{ - 'format_id': 'sd', - 'url': video_url, - 'quality': 1, - }] - return { 'id': video_id, 'title': title, From eecea00d36f29f3b22e5936ed48fa91456ab066a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 Apr 2017 18:56:35 +0700 Subject: [PATCH 019/611] [xfileshare] Add support for vidabc.com (closes #12589) --- youtube_dl/extractor/xfileshare.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index 6de5b26d7..6856fb3bf 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -27,6 +27,7 @@ class XFileShareIE(InfoExtractor): ('vidto.me', 'Vidto'), ('streamin.to', 'Streamin.To'), ('xvidstage.com', 'XVIDSTAGE'), + ('vidabc.com', 'Vid ABC'), ) IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1]) From 91399b2fcc95e72f052ee9eab8e12b68d1815c9e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 1 Apr 2017 13:32:38 +0100 Subject: [PATCH 020/611] [funimation] fix extraction(closes #10696)(#11773) --- youtube_dl/extractor/funimation.py | 211 ++++++++++------------------- 1 file changed, 73 insertions(+), 138 deletions(-) diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index eba00cd5a..e44a2a87f 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -7,9 +7,9 @@ from ..compat import ( compat_urllib_parse_unquote_plus, ) from ..utils import ( - clean_html, determine_ext, int_or_none, + js_to_json, sanitized_Request, ExtractorError, urlencode_postdata @@ -17,34 +17,26 @@ from ..utils import ( class FunimationIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?funimation\.com/shows/[^/]+/videos/(?:official|promotional)/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/shows/[^/]+/(?P[^/?#&]+)' _NETRC_MACHINE = 'funimation' _TESTS = [{ - 'url': 'http://www.funimation.com/shows/air/videos/official/breeze', + 'url': 'https://www.funimation.com/shows/hacksign/role-play/', 'info_dict': { - 'id': '658', - 'display_id': 'breeze', - 'ext': 'mp4', - 'title': 'Air - 1 - Breeze', - 'description': 'md5:1769f43cd5fc130ace8fd87232207892', - 'thumbnail': r're:https?://.*\.jpg', - }, - 'skip': 'Access without user interaction is forbidden by CloudFlare, and video removed', - }, { - 'url': 'http://www.funimation.com/shows/hacksign/videos/official/role-play', - 'info_dict': { - 'id': '31128', + 'id': '91144', 'display_id': 'role-play', 'ext': 'mp4', - 'title': '.hack//SIGN - 1 - Role Play', + 'title': '.hack//SIGN - Role Play', 'description': 'md5:b602bdc15eef4c9bbb201bb6e6a4a2dd', 'thumbnail': r're:https?://.*\.jpg', }, - 'skip': 'Access without user interaction is forbidden by CloudFlare', + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { - 'url': 'http://www.funimation.com/shows/attack-on-titan-junior-high/videos/promotional/broadcast-dub-preview', + 'url': 'https://www.funimation.com/shows/attack-on-titan-junior-high/broadcast-dub-preview/', 'info_dict': { 'id': '9635', 'display_id': 'broadcast-dub-preview', @@ -54,25 +46,13 @@ class FunimationIE(InfoExtractor): 'thumbnail': r're:https?://.*\.(?:jpg|png)', }, 'skip': 'Access without user interaction is forbidden by CloudFlare', + }, { + 'url': 'https://www.funimationnow.uk/shows/puzzle-dragons-x/drop-impact/simulcast/', + 'only_matching': True, }] _LOGIN_URL = 'http://www.funimation.com/login' - def _download_webpage(self, *args, **kwargs): - try: - return super(FunimationIE, self)._download_webpage(*args, **kwargs) - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - response = ee.cause.read() - if b'>Please complete the security check to access<' in response: - raise ExtractorError( - 'Access to funimation.com is blocked by CloudFlare. ' - 'Please browse to http://www.funimation.com/, solve ' - 'the reCAPTCHA, export browser cookies to a text file,' - ' and then try again with --cookies YOUR_COOKIE_FILE.', - expected=True) - raise - def _extract_cloudflare_session_ua(self, url): ci_session_cookie = self._get_cookies(url).get('ci_session') if ci_session_cookie: @@ -114,119 +94,74 @@ class FunimationIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + def _search_kane(name): + return self._search_regex( + r"KANE_customdimensions\.%s\s*=\s*'([^']+)';" % name, + webpage, name, default=None) + + title_data = self._parse_json(self._search_regex( + r'TITLE_DATA\s*=\s*({[^}]+})', + webpage, 'title data', default=''), + display_id, js_to_json, fatal=False) or {} + + video_id = title_data.get('id') or self._search_regex([ + r"KANE_customdimensions.videoID\s*=\s*'(\d+)';", + r']+src="/player/(\d+)"', + ], webpage, 'video_id', default=None) + if not video_id: + player_url = self._html_search_meta([ + 'al:web:url', + 'og:video:url', + 'og:video:secure_url', + ], webpage, fatal=True) + video_id = self._search_regex(r'/player/(\d+)', player_url, 'video id') + + title = episode = title_data.get('title') or _search_kane('videoTitle') or self._og_search_title(webpage) + series = _search_kane('showName') + if series: + title = '%s - %s' % (series, title) + description = self._html_search_meta(['description', 'og:description'], webpage, fatal=True) + + try: + sources = self._download_json( + 'https://prod-api-funimationnow.dadcdigital.com/api/source/catalog/video/%s/signed/' % video_id, + video_id)['items'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + error = self._parse_json(e.cause.read(), video_id)['errors'][0] + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, error.get('detail') or error.get('title')), expected=True) + raise - errors = [] formats = [] - - ERRORS_MAP = { - 'ERROR_MATURE_CONTENT_LOGGED_IN': 'matureContentLoggedIn', - 'ERROR_MATURE_CONTENT_LOGGED_OUT': 'matureContentLoggedOut', - 'ERROR_SUBSCRIPTION_LOGGED_OUT': 'subscriptionLoggedOut', - 'ERROR_VIDEO_EXPIRED': 'videoExpired', - 'ERROR_TERRITORY_UNAVAILABLE': 'territoryUnavailable', - 'SVODBASIC_SUBSCRIPTION_IN_PLAYER': 'basicSubscription', - 'SVODNON_SUBSCRIPTION_IN_PLAYER': 'nonSubscription', - 'ERROR_PLAYER_NOT_RESPONDING': 'playerNotResponding', - 'ERROR_UNABLE_TO_CONNECT_TO_CDN': 'unableToConnectToCDN', - 'ERROR_STREAM_NOT_FOUND': 'streamNotFound', - } - - USER_AGENTS = ( - # PC UA is served with m3u8 that provides some bonus lower quality formats - ('pc', 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0'), - # Mobile UA allows to extract direct links and also does not fail when - # PC UA fails with hulu error (e.g. - # http://www.funimation.com/shows/hacksign/videos/official/role-play) - ('mobile', 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.114 Mobile Safari/537.36'), - ) - - user_agent = self._extract_cloudflare_session_ua(url) - if user_agent: - USER_AGENTS = ((None, user_agent),) - - for kind, user_agent in USER_AGENTS: - request = sanitized_Request(url) - request.add_header('User-Agent', user_agent) - webpage = self._download_webpage( - request, display_id, - 'Downloading %s webpage' % kind if kind else 'Downloading webpage') - - playlist = self._parse_json( - self._search_regex( - r'var\s+playersData\s*=\s*(\[.+?\]);\n', - webpage, 'players data'), - display_id)[0]['playlist'] - - items = next(item['items'] for item in playlist if item.get('items')) - item = next(item for item in items if item.get('itemAK') == display_id) - - error_messages = {} - video_error_messages = self._search_regex( - r'var\s+videoErrorMessages\s*=\s*({.+?});\n', - webpage, 'error messages', default=None) - if video_error_messages: - error_messages_json = self._parse_json(video_error_messages, display_id, fatal=False) - if error_messages_json: - for _, error in error_messages_json.items(): - type_ = error.get('type') - description = error.get('description') - content = error.get('content') - if type_ == 'text' and description and content: - error_message = ERRORS_MAP.get(description) - if error_message: - error_messages[error_message] = content - - for video in item.get('videoSet', []): - auth_token = video.get('authToken') - if not auth_token: - continue - funimation_id = video.get('FUNImationID') or video.get('videoId') - preference = 1 if video.get('languageMode') == 'dub' else 0 - if not auth_token.startswith('?'): - auth_token = '?%s' % auth_token - for quality, height in (('sd', 480), ('hd', 720), ('hd1080', 1080)): - format_url = video.get('%sUrl' % quality) - if not format_url: - continue - if not format_url.startswith(('http', '//')): - errors.append(format_url) - continue - if determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url + auth_token, display_id, 'mp4', entry_protocol='m3u8_native', - preference=preference, m3u8_id='%s-hls' % funimation_id, fatal=False)) - else: - tbr = int_or_none(self._search_regex( - r'-(\d+)[Kk]', format_url, 'tbr', default=None)) - formats.append({ - 'url': format_url + auth_token, - 'format_id': '%s-http-%dp' % (funimation_id, height), - 'height': height, - 'tbr': tbr, - 'preference': preference, - }) - - if not formats and errors: - raise ExtractorError( - '%s returned error: %s' - % (self.IE_NAME, clean_html(error_messages.get(errors[0], errors[0]))), - expected=True) - + for source in sources: + source_url = source.get('src') + if not source_url: + continue + source_type = source.get('videoType') or determine_ext(source_url) + if source_type == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'format_id': source_type, + 'url': source_url, + }) self._sort_formats(formats) - title = item['title'] - artist = item.get('artist') - if artist: - title = '%s - %s' % (artist, title) - description = self._og_search_description(webpage) or item.get('description') - thumbnail = self._og_search_thumbnail(webpage) or item.get('posterUrl') - video_id = item.get('itemId') or display_id - return { 'id': video_id, 'display_id': display_id, 'title': title, 'description': description, - 'thumbnail': thumbnail, + 'thumbnail': self._og_search_thumbnail(webpage), + 'series': series, + 'season_number': int_or_none(title_data.get('seasonNum') or _search_kane('season')), + 'episode_number': int_or_none(title_data.get('episodeNum')), + 'episode': episode, + 'season_id': title_data.get('seriesId'), 'formats': formats, } From a6f3a162f35cc05ac5a34773b438dd4c5f0d164a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 1 Apr 2017 15:35:39 +0100 Subject: [PATCH 021/611] [limelight] improve extraction for audio only formats --- youtube_dl/extractor/limelight.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index 422be2528..f52c2e169 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -62,13 +62,21 @@ class LimelightBaseIE(InfoExtractor): fmt = { 'url': stream_url, 'abr': float_or_none(stream.get('audioBitRate')), - 'vbr': float_or_none(stream.get('videoBitRate')), 'fps': float_or_none(stream.get('videoFrameRate')), - 'width': int_or_none(stream.get('videoWidthInPixels')), - 'height': int_or_none(stream.get('videoHeightInPixels')), 'ext': ext, } - rtmp = re.search(r'^(?Prtmpe?://(?P[^/]+)/(?P.+))/(?Pmp4:.+)$', stream_url) + width = int_or_none(stream.get('videoWidthInPixels')) + height = int_or_none(stream.get('videoHeightInPixels')) + vbr = float_or_none(stream.get('videoBitRate')) + if width or height or vbr: + fmt.update({ + 'width': width, + 'height': height, + 'vbr': vbr, + }) + else: + fmt['vcodec'] = 'none' + rtmp = re.search(r'^(?Prtmpe?://(?P[^/]+)/(?P.+))/(?Pmp[34]:.+)$', stream_url) if rtmp: format_id = 'rtmp' if stream.get('videoBitRate'): From 48ab554feb9c6d3e0f13e1357e04f4c89089e2d3 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 1 Apr 2017 18:09:36 +0100 Subject: [PATCH 022/611] [vrv] add support for series pages --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/vrv.py | 88 ++++++++++++++++++++++-------- 2 files changed, 68 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 1b427e256..980333a11 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1182,7 +1182,10 @@ from .voxmedia import VoxMediaIE from .vporn import VpornIE from .vrt import VRTIE from .vrak import VrakIE -from .vrv import VRVIE +from .vrv import ( + VRVIE, + VRVSeriesIE, +) from .medialaan import MedialaanIE from .vube import VubeIE from .vuclip import VuClipIE diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py index 33618c951..487047fd7 100644 --- a/youtube_dl/extractor/vrv.py +++ b/youtube_dl/extractor/vrv.py @@ -20,22 +20,7 @@ from ..utils import ( ) -class VRVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P[A-Z0-9]+)' - _TEST = { - 'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT', - 'info_dict': { - 'id': 'GR9PNZ396', - 'ext': 'mp4', - 'title': 'BOSTON: WHERE THE PAST IS THE PRESENT', - 'description': 'md5:4ec8844ac262ca2df9e67c0983c6b83f', - 'uploader_id': 'seeso', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } +class VRVBaseIE(InfoExtractor): _API_DOMAIN = None _API_PARAMS = {} _CMS_SIGNING = {} @@ -64,6 +49,8 @@ class VRVIE(InfoExtractor): note='Downloading %s JSON metadata' % note, headers=headers, data=data) def _call_cms(self, path, video_id, note): + if not self._CMS_SIGNING: + self._CMS_SIGNING = self._call_api('index', video_id, 'CMS Signing')['cms_signing'] return self._download_json( self._API_DOMAIN + path, video_id, query=self._CMS_SIGNING, note='Downloading %s JSON metadata' % note, headers=self.geo_verification_headers()) @@ -75,9 +62,30 @@ class VRVIE(InfoExtractor): webpage, 'api config'), video_id)['cxApiParams'] self._API_DOMAIN = self._API_PARAMS.get('apiDomain', 'https://api.vrv.co') - def _set_cms_signing(self, video_id): - if not self._CMS_SIGNING: - self._CMS_SIGNING = self._call_api('index', video_id, 'CMS Signing')['cms_signing'] + def _get_cms_resource(self, resource_key, video_id): + return self._call_api( + 'cms_resource', video_id, 'resource path', data={ + 'resource_key': resource_key, + })['__links__']['cms_resource']['href'] + + +class VRVIE(VRVBaseIE): + IE_NAME = 'vrv' + _VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P[A-Z0-9]+)' + _TEST = { + 'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT', + 'info_dict': { + 'id': 'GR9PNZ396', + 'ext': 'mp4', + 'title': 'BOSTON: WHERE THE PAST IS THE PRESENT', + 'description': 'md5:4ec8844ac262ca2df9e67c0983c6b83f', + 'uploader_id': 'seeso', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } def _real_extract(self, url): video_id = self._match_id(url) @@ -91,10 +99,8 @@ class VRVIE(InfoExtractor): video_data = media_resource.get('json') if not video_data: self._set_api_params(webpage, video_id) - episode_path = self._call_api('cms_resource', video_id, 'episode resource path', data={ - 'resource_key': 'cms:/episodes/' + video_id, - })['__links__']['cms_resource']['href'] - self._set_cms_signing(video_id) + episode_path = self._get_cms_resource( + 'cms:/episodes/' + video_id, video_id) video_data = self._call_cms(episode_path, video_id, 'video') title = video_data['title'] @@ -102,7 +108,6 @@ class VRVIE(InfoExtractor): if not streams_json: self._set_api_params(webpage, video_id) streams_path = video_data['__links__']['streams']['href'] - self._set_cms_signing(video_id) streams_json = self._call_cms(streams_path, video_id, 'streams') audio_locale = streams_json.get('audio_locale') @@ -149,3 +154,38 @@ class VRVIE(InfoExtractor): 'episode_number': int_or_none(video_data.get('episode_number')), 'episode_id': video_data.get('production_episode_id'), } + + +class VRVSeriesIE(VRVBaseIE): + IE_NAME = 'vrv:series' + _VALID_URL = r'https?://(?:www\.)?vrv\.co/series/(?P[A-Z0-9]+)' + _TEST = { + 'url': 'https://vrv.co/series/G68VXG3G6/The-Perfect-Insider', + 'info_dict': { + 'id': 'G68VXG3G6', + }, + 'playlist_mincount': 11, + } + + def _real_extract(self, url): + series_id = self._match_id(url) + webpage = self._download_webpage( + url, series_id, + headers=self.geo_verification_headers()) + + self._set_api_params(webpage, series_id) + seasons_path = self._get_cms_resource( + 'cms:/seasons?series_id=' + series_id, series_id) + seasons_data = self._call_cms(seasons_path, series_id, 'seasons') + + entries = [] + for season in seasons_data.get('items', []): + episodes_path = season['__links__']['season/episodes']['href'] + episodes = self._call_cms(episodes_path, series_id, 'episodes') + for episode in episodes.get('items', []): + episode_id = episode['id'] + entries.append(self.url_result( + 'https://vrv.co/watch/' + episode_id, + 'VRV', episode_id, episode.get('title'))) + + return self.playlist_result(entries, series_id) From 51342717cddafde83dbf39f2212be40a196a577a Mon Sep 17 00:00:00 2001 From: Timendum Date: Tue, 14 Mar 2017 16:11:09 +0100 Subject: [PATCH 023/611] [rai] Fix extraction --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/rai.py | 359 ++++++++++++++++------------- 2 files changed, 197 insertions(+), 164 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 980333a11..d9e8d53ac 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -802,7 +802,7 @@ from .radiojavan import RadioJavanIE from .radiobremen import RadioBremenIE from .radiofrance import RadioFranceIE from .rai import ( - RaiTVIE, + RaiPlayIE, RaiIE, ) from .rbmaradio import RBMARadioIE diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 41afbd9af..b67e94f88 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -3,8 +3,8 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( - determine_ext, ExtractorError, + determine_ext, find_xpath_attr, fix_xml_ampersands, int_or_none, @@ -55,7 +55,200 @@ class RaiBaseIE(InfoExtractor): return formats - def _extract_from_content_id(self, content_id, base_url): + +class RaiPlayIE(RaiBaseIE): + _VALID_URL = r'https?://(?:www\.)?raiplay\.it/.+?-(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})\.html' + _TESTS = [{ + 'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter', + 'md5': '340aa3b7afb54bfd14a8c11786450d76', + 'info_dict': { + 'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66', + 'ext': 'mp4', + 'title': 'La Casa Bianca', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': r're:^Rai.+', + 'description': 're:^[A-Za-z]+' + } + }, { + 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', + 'md5': 'ed4da3d70ccf8129a33ab16b34d20ab8', + 'info_dict': { + 'id': 'efebe701-969c-4593-92f3-285f0d1ce750', + 'ext': 'mp4', + 'title': 'Gazebo - #gazebotraindesi', + 'thumbnail': r're:^https?://.*\.png$', + 'uploader': r're:^Rai.+', + 'description': r're:^[A-Za-z]+' + } + }, { + 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', + 'md5': '8970abf8caf8aef4696e7b1f2adfc696', + 'info_dict': { + 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', + 'ext': 'mp4', + 'title': 'Report - Report del 07/04/2014', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': r're:^Rai.+', + 'description': r're:^[A-Za-z]+' + } + }] + _RESOLUTION = '600x400' + + def _real_extract(self, url): + video_id = self._match_id(url) + + # remove query and fragment part from url + canonical_url = compat_urlparse.urljoin(url, compat_urlparse.urlparse(url).path) + webpage = self._download_webpage(canonical_url, video_id) + + media = self._download_json('%s?json' % canonical_url, + video_id, 'Downloading video JSON') + + thumbnails = [] + if 'images' in media: + for _, value in media.get('images').items(): + if value: + thumbnails.append({ + 'url': value.replace('[RESOLUTION]', self._RESOLUTION) + }) + + if 'video' not in media: + raise ExtractorError('No video found') + + video = media.get('video') + duration = parse_duration(video.get('duration')), + formats = self._extract_relinker_formats(video.get('contentUrl'), video_id) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage).replace(' - video - RaiPlay', ''), + 'description': self._og_search_description(webpage), + 'uploader': media.get('channel'), + 'duration': duration, + 'thumbnails': thumbnails, + 'formats': formats + } + + +class RaiIE(RaiBaseIE): + _VALID_URL = r'https?://.+\.(?:rai|rainews)\.it/dl/.+?-(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' + _TESTS = [{ + # subdomain test case + 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', + 'info_dict': { + 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', + 'ext': 'mp4', + 'title': 'TG PRIMO TEMPO', + 'upload_date': '20140612', + 'duration': 1758, + 'thumbnail': r're:^https?://.*\.jpg$' + } + }, { + # rainews test case + 'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', + 'info_dict': { + 'id': '1632c009-c843-4836-bb65-80c33084a64b', + 'ext': 'mp4', + 'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor \"La ragazza del treno\" ', + 'upload_date': '20161103', + 'thumbnail': r're:^https?://.*\.png$', + 'description': r're:^[A-Za-z]+' + } + }, { + # with media information + 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', + 'md5': '11959b4e44fa74de47011b5799490adf', + 'info_dict': { + 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', + 'ext': 'mp4', + 'title': 'TG1 ore 20:00 del 03/11/2016', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20161103', + 'description': r're:^[A-Za-z]+' + } + }, { + # drawMediaRaiTV test case + 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', + 'md5': '2dd727e61114e1ee9c47f0da6914e178', + 'info_dict': { + 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', + 'ext': 'mp4', + 'title': 'Il pacco', + 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', + 'upload_date': '20141221', + }, + }, { + # Direct relinker URL + 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', + # HDS live stream, MD5 is unstable + 'info_dict': { + 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', + 'ext': 'flv', + 'title': 'EuroNews', + }, + }, { + # Embedded content item ID + 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', + 'info_dict': { + 'id': 'd80d4b70-3812-4501-a888-92edec729f00', + 'ext': 'mp4', + 'title': r're:TG1 ore \d{2}:\d{2} del \d{2}/\d{2}/\d{4}', + 'upload_date': r're:\d{8}', + 'description': r're:.+', + }, + }, { + 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', + # HDS live stream, MD5 is unstable + 'info_dict': { + 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', + 'ext': 'mp4', + 'title': 'La diretta di Rainews24', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + iframe_url = self._search_regex( + [r']+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"', + r'drawMediaRaiTV\(["\'](.+?)["\']'], + webpage, 'iframe', default=None) + if iframe_url: + if not iframe_url.startswith('http'): + iframe_url = compat_urlparse.urljoin(url, iframe_url) + return self.url_result(iframe_url) + + content_item_id = self._search_regex( + r'initEdizione\((?P[\'"])ContentItem-(?P[^\'"]+)(?P=q1)', + webpage, 'content item ID', group='content_id', default=None) + if content_item_id: + return self._extract_from_content_id(content_item_id, url) + + try: + return self._extract_from_content_id(video_id, url) + except ExtractorError: + # no media data, only direct relinker + pass + + relinker_url = compat_urlparse.urljoin(url, self._search_regex( + r'(?:var\s+videoURL|mediaInfo\.mediaUri)\s*=\s*(?P[\'"])(?P(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)', + webpage, 'relinker URL', group='url')) + formats = self._extract_relinker_formats(relinker_url, video_id) + self._sort_formats(formats) + + title = self._search_regex( + r'var\s+videoTitolo\s*=\s*([\'"])(?P[^\'"]+)\1', + webpage, 'title', group='title', default=None) or self._og_search_title(webpage) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + } + + def _extract_from_content_id(self, content_id, url): media = self._download_json( 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id, content_id, 'Downloading video JSON') @@ -65,7 +258,7 @@ class RaiBaseIE(InfoExtractor): thumbnail_url = media.get(image_type) if thumbnail_url: thumbnails.append({ - 'url': compat_urlparse.urljoin(base_url, thumbnail_url), + 'url': compat_urlparse.urljoin(url, thumbnail_url), }) formats = [] @@ -105,163 +298,3 @@ class RaiBaseIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, } - - -class RaiTVIE(RaiBaseIE): - _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+(?:media|ondemand)/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' - _TESTS = [ - { - 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html', - 'md5': '8970abf8caf8aef4696e7b1f2adfc696', - 'info_dict': { - 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', - 'ext': 'mp4', - 'title': 'Report del 07/04/2014', - 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', - 'upload_date': '20140407', - 'duration': 6160, - 'thumbnail': r're:^https?://.*\.jpg$', - } - }, - { - # no m3u8 stream - 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', - # HDS download, MD5 is unstable - 'info_dict': { - 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', - 'ext': 'flv', - 'title': 'TG PRIMO TEMPO', - 'upload_date': '20140612', - 'duration': 1758, - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'skip': 'Geo-restricted to Italy', - }, - { - 'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html', - 'md5': '35cf7c229f22eeef43e48b5cf923bef0', - 'info_dict': { - 'id': '7aafdea9-0e5d-49d5-88a6-7e65da67ae13', - 'ext': 'mp4', - 'title': 'State of the Net, Antonella La Carpia: regole virali', - 'description': 'md5:b0ba04a324126903e3da7763272ae63c', - 'upload_date': '20140613', - }, - 'skip': 'Error 404', - }, - { - 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-b4a49761-e0cc-4b14-8736-2729f6f73132-tg2.html', - 'info_dict': { - 'id': 'b4a49761-e0cc-4b14-8736-2729f6f73132', - 'ext': 'mp4', - 'title': 'Alluvione in Sardegna e dissesto idrogeologico', - 'description': 'Edizione delle ore 20:30 ', - }, - 'skip': 'invalid urls', - }, - { - 'url': 'http://www.ilcandidato.rai.it/dl/ray/media/Il-Candidato---Primo-episodio-Le-Primarie-28e5525a-b495-45e8-a7c3-bc48ba45d2b6.html', - 'md5': 'e57493e1cb8bc7c564663f363b171847', - 'info_dict': { - 'id': '28e5525a-b495-45e8-a7c3-bc48ba45d2b6', - 'ext': 'mp4', - 'title': 'Il Candidato - Primo episodio: "Le Primarie"', - 'description': 'md5:364b604f7db50594678f483353164fb8', - 'upload_date': '20140923', - 'duration': 386, - 'thumbnail': r're:^https?://.*\.jpg$', - } - }, - ] - - def _real_extract(self, url): - video_id = self._match_id(url) - - return self._extract_from_content_id(video_id, url) - - -class RaiIE(RaiBaseIE): - _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' - _TESTS = [ - { - 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', - 'md5': '2dd727e61114e1ee9c47f0da6914e178', - 'info_dict': { - 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', - 'ext': 'mp4', - 'title': 'Il pacco', - 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', - 'upload_date': '20141221', - }, - }, - { - # Direct relinker URL - 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', - # HDS live stream, MD5 is unstable - 'info_dict': { - 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', - 'ext': 'flv', - 'title': 'EuroNews', - }, - 'skip': 'Geo-restricted to Italy', - }, - { - # Embedded content item ID - 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', - 'md5': '84c1135ce960e8822ae63cec34441d63', - 'info_dict': { - 'id': '0960e765-62c8-474a-ac4b-7eb3e2be39c8', - 'ext': 'mp4', - 'title': 'TG1 ore 20:00 del 02/07/2016', - 'upload_date': '20160702', - }, - }, - { - 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', - # HDS live stream, MD5 is unstable - 'info_dict': { - 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', - 'ext': 'flv', - 'title': 'La diretta di Rainews24', - }, - }, - ] - - @classmethod - def suitable(cls, url): - return False if RaiTVIE.suitable(url) else super(RaiIE, cls).suitable(url) - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - iframe_url = self._search_regex( - [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"', - r'drawMediaRaiTV\(["\'](.+?)["\']'], - webpage, 'iframe', default=None) - if iframe_url: - if not iframe_url.startswith('http'): - iframe_url = compat_urlparse.urljoin(url, iframe_url) - return self.url_result(iframe_url) - - content_item_id = self._search_regex( - r'initEdizione\((?P<q1>[\'"])ContentItem-(?P<content_id>[^\'"]+)(?P=q1)', - webpage, 'content item ID', group='content_id', default=None) - if content_item_id: - return self._extract_from_content_id(content_item_id, url) - - relinker_url = compat_urlparse.urljoin(url, self._search_regex( - r'(?:var\s+videoURL|mediaInfo\.mediaUri)\s*=\s*(?P<q1>[\'"])(?P<url>(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)', - webpage, 'relinker URL', group='url')) - formats = self._extract_relinker_formats(relinker_url, video_id) - self._sort_formats(formats) - - title = self._search_regex( - r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1', - webpage, 'title', group='title', default=None) or self._og_search_title(webpage) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - } From b8d8cced9b55c57f3b09e83972be9d6318a459ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Apr 2017 02:14:42 +0700 Subject: [PATCH 024/611] [rai] Improve extraction (closes #11790) * Fix georestriction detection * Detect live streams + Extract relinker metadata * Improve ContentItem detection + Extract series metadata * Fix tests --- youtube_dl/extractor/rai.py | 359 +++++++++++++++++++++++------------- 1 file changed, 233 insertions(+), 126 deletions(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index b67e94f88..b77b0a08e 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -1,23 +1,40 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_urlparse, + compat_str, +) from ..utils import ( ExtractorError, determine_ext, find_xpath_attr, fix_xml_ampersands, + GeoRestrictedError, int_or_none, parse_duration, + strip_or_none, + try_get, unified_strdate, + unified_timestamp, update_url_query, + urljoin, xpath_text, ) class RaiBaseIE(InfoExtractor): - def _extract_relinker_formats(self, relinker_url, video_id): + _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' + _GEO_COUNTRIES = ['IT'] + _GEO_BYPASS = False + + def _extract_relinker_info(self, relinker_url, video_id): formats = [] + geoprotection = None + is_live = None + duration = None for platform in ('mon', 'flash', 'native'): relinker = self._download_xml( @@ -27,9 +44,27 @@ class RaiBaseIE(InfoExtractor): query={'output': 45, 'pl': platform}, headers=self.geo_verification_headers()) - media_url = find_xpath_attr(relinker, './url', 'type', 'content').text + if not geoprotection: + geoprotection = xpath_text( + relinker, './geoprotection', default=None) == 'Y' + + if not is_live: + is_live = xpath_text( + relinker, './is_live', default=None) == 'Y' + if not duration: + duration = parse_duration(xpath_text( + relinker, './duration', default=None)) + + url_elem = find_xpath_attr(relinker, './url', 'type', 'content') + if url_elem is None: + continue + + media_url = url_elem.text + + # This does not imply geo restriction (e.g. + # http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html) if media_url == 'http://download.rai.it/video_no_available.mp4': - self.raise_geo_restricted() + continue ext = determine_ext(media_url) if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'): @@ -53,11 +88,18 @@ class RaiBaseIE(InfoExtractor): 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', }) - return formats + if not formats and geoprotection is True: + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + + return dict((k, v) for k, v in { + 'is_live': is_live, + 'duration': duration, + 'formats': formats, + }.items() if v is not None) class RaiPlayIE(RaiBaseIE): - _VALID_URL = r'https?://(?:www\.)?raiplay\.it/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})\.html' + _VALID_URL = r'(?P<url>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s)\.html)' % RaiBaseIE._UUID_RE _TESTS = [{ 'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter', 'md5': '340aa3b7afb54bfd14a8c11786450d76', @@ -65,110 +107,130 @@ class RaiPlayIE(RaiBaseIE): 'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66', 'ext': 'mp4', 'title': 'La Casa Bianca', + 'alt_title': 'S2016 - Puntata del 23/10/2016', + 'description': 'md5:a09d45890850458077d1f68bb036e0a5', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': r're:^Rai.+', - 'description': 're:^[A-Za-z]+' - } - }, { - 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', - 'md5': 'ed4da3d70ccf8129a33ab16b34d20ab8', - 'info_dict': { - 'id': 'efebe701-969c-4593-92f3-285f0d1ce750', - 'ext': 'mp4', - 'title': 'Gazebo - #gazebotraindesi', - 'thumbnail': r're:^https?://.*\.png$', - 'uploader': r're:^Rai.+', - 'description': r're:^[A-Za-z]+' - } + 'uploader': 'Rai 3', + 'creator': 'Rai 3', + 'duration': 3278, + 'timestamp': 1477764300, + 'upload_date': '20161029', + 'series': 'La Casa Bianca', + 'season': '2016', + }, }, { 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', 'info_dict': { 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', 'ext': 'mp4', - 'title': 'Report - Report del 07/04/2014', + 'title': 'Report del 07/04/2014', + 'alt_title': 'S2013/14 - Puntata del 07/04/2014', + 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': r're:^Rai.+', - 'description': r're:^[A-Za-z]+' - } + 'uploader': 'Rai 5', + 'creator': 'Rai 5', + 'duration': 6160, + 'series': 'Report', + 'season_number': 5, + 'season': '2013/14', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', + 'only_matching': True, }] - _RESOLUTION = '600x400' def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + url, video_id = mobj.group('url', 'id') - # remove query and fragment part from url - canonical_url = compat_urlparse.urljoin(url, compat_urlparse.urlparse(url).path) - webpage = self._download_webpage(canonical_url, video_id) + media = self._download_json( + '%s?json' % url, video_id, 'Downloading video JSON') - media = self._download_json('%s?json' % canonical_url, - video_id, 'Downloading video JSON') + title = media['name'] + + video = media['video'] + + relinker_info = self._extract_relinker_info(video['contentUrl'], video_id) + self._sort_formats(relinker_info['formats']) thumbnails = [] if 'images' in media: for _, value in media.get('images').items(): if value: thumbnails.append({ - 'url': value.replace('[RESOLUTION]', self._RESOLUTION) + 'url': value.replace('[RESOLUTION]', '600x400') }) - if 'video' not in media: - raise ExtractorError('No video found') + timestamp = unified_timestamp(try_get( + media, lambda x: x['availabilities'][0]['start'], compat_str)) - video = media.get('video') - duration = parse_duration(video.get('duration')), - formats = self._extract_relinker_formats(video.get('contentUrl'), video_id) - self._sort_formats(formats) - - return { + info = { 'id': video_id, - 'title': self._og_search_title(webpage).replace(' - video - RaiPlay', ''), - 'description': self._og_search_description(webpage), + 'title': title, + 'alt_title': media.get('subtitle'), + 'description': media.get('description'), 'uploader': media.get('channel'), - 'duration': duration, + 'creator': media.get('editor'), + 'duration': parse_duration(video.get('duration')), + 'timestamp': timestamp, 'thumbnails': thumbnails, - 'formats': formats + 'series': try_get( + media, lambda x: x['isPartOf']['name'], compat_str), + 'season_number': int_or_none(try_get( + media, lambda x: x['isPartOf']['numeroStagioni'])), + 'season': media.get('stagione') or None, } + info.update(relinker_info) + + return info + class RaiIE(RaiBaseIE): - _VALID_URL = r'https?://.+\.(?:rai|rainews)\.it/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' + _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/dl/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE _TESTS = [{ - # subdomain test case + # var uniquename = "ContentItem-..." + # data-id="ContentItem-..." 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', 'info_dict': { 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', 'ext': 'mp4', 'title': 'TG PRIMO TEMPO', - 'upload_date': '20140612', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 1758, - 'thumbnail': r're:^https?://.*\.jpg$' + 'upload_date': '20140612', } }, { - # rainews test case + # with ContentItem in many metas 'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', 'info_dict': { 'id': '1632c009-c843-4836-bb65-80c33084a64b', 'ext': 'mp4', - 'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor \"La ragazza del treno\" ', - 'upload_date': '20161103', + 'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor "La ragazza del treno"', + 'description': 'I film in uscita questa settimana.', 'thumbnail': r're:^https?://.*\.png$', - 'description': r're:^[A-Za-z]+' + 'duration': 833, + 'upload_date': '20161103', } }, { - # with media information + # with ContentItem in og:url 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', 'md5': '11959b4e44fa74de47011b5799490adf', 'info_dict': { 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', 'ext': 'mp4', 'title': 'TG1 ore 20:00 del 03/11/2016', + 'description': 'TG1 edizione integrale ore 20:00 del giorno 03/11/2016', 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2214, 'upload_date': '20161103', - 'description': r're:^[A-Za-z]+' } }, { - # drawMediaRaiTV test case + # drawMediaRaiTV(...) 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', 'md5': '2dd727e61114e1ee9c47f0da6914e178', 'info_dict': { @@ -176,83 +238,67 @@ class RaiIE(RaiBaseIE): 'ext': 'mp4', 'title': 'Il pacco', 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20141221', }, }, { - # Direct relinker URL + # initEdizione('ContentItem-...' + 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', + 'info_dict': { + 'id': 'c2187016-8484-4e3a-8ac8-35e475b07303', + 'ext': 'mp4', + 'title': r're:TG1 ore \d{2}:\d{2} del \d{2}/\d{2}/\d{4}', + 'duration': 2274, + 'upload_date': '20170401', + }, + 'skip': 'Changes daily', + }, { + # HDS live stream with only relinker URL 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', - # HDS live stream, MD5 is unstable 'info_dict': { 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', 'ext': 'flv', 'title': 'EuroNews', }, - }, { - # Embedded content item ID - 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', - 'info_dict': { - 'id': 'd80d4b70-3812-4501-a888-92edec729f00', - 'ext': 'mp4', - 'title': r're:TG1 ore \d{2}:\d{2} del \d{2}/\d{2}/\d{4}', - 'upload_date': r're:\d{8}', - 'description': r're:.+', + 'params': { + 'skip_download': True, }, }, { + # HLS live stream with ContentItem in og:url 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', - # HDS live stream, MD5 is unstable 'info_dict': { 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', 'ext': 'mp4', 'title': 'La diretta di Rainews24', }, + 'params': { + 'skip_download': True, + }, }] - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - iframe_url = self._search_regex( - [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"', - r'drawMediaRaiTV\(["\'](.+?)["\']'], - webpage, 'iframe', default=None) - if iframe_url: - if not iframe_url.startswith('http'): - iframe_url = compat_urlparse.urljoin(url, iframe_url) - return self.url_result(iframe_url) - - content_item_id = self._search_regex( - r'initEdizione\((?P<q1>[\'"])ContentItem-(?P<content_id>[^\'"]+)(?P=q1)', - webpage, 'content item ID', group='content_id', default=None) - if content_item_id: - return self._extract_from_content_id(content_item_id, url) - - try: - return self._extract_from_content_id(video_id, url) - except ExtractorError: - # no media data, only direct relinker - pass - - relinker_url = compat_urlparse.urljoin(url, self._search_regex( - r'(?:var\s+videoURL|mediaInfo\.mediaUri)\s*=\s*(?P<q1>[\'"])(?P<url>(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)', - webpage, 'relinker URL', group='url')) - formats = self._extract_relinker_formats(relinker_url, video_id) - self._sort_formats(formats) - - title = self._search_regex( - r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1', - webpage, 'title', group='title', default=None) or self._og_search_title(webpage) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - } - def _extract_from_content_id(self, content_id, url): media = self._download_json( 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id, content_id, 'Downloading video JSON') + title = media['name'].strip() + + media_type = media['type'] + if 'Audio' in media_type: + relinker_info = { + 'formats': { + 'format_id': media.get('formatoAudio'), + 'url': media['audioUrl'], + 'ext': media.get('formatoAudio'), + } + } + elif 'Video' in media_type: + relinker_info = self._extract_relinker_info(media['mediaUri'], content_id) + else: + raise ExtractorError('not a media file') + + self._sort_formats(relinker_info['formats']) + thumbnails = [] for image_type in ('image', 'image_medium', 'image_300'): thumbnail_url = media.get(image_type) @@ -261,20 +307,6 @@ class RaiIE(RaiBaseIE): 'url': compat_urlparse.urljoin(url, thumbnail_url), }) - formats = [] - media_type = media['type'] - if 'Audio' in media_type: - formats.append({ - 'format_id': media.get('formatoAudio'), - 'url': media['audioUrl'], - 'ext': media.get('formatoAudio'), - }) - elif 'Video' in media_type: - formats.extend(self._extract_relinker_formats(media['mediaUri'], content_id)) - self._sort_formats(formats) - else: - raise ExtractorError('not a media file') - subtitles = {} captions = media.get('subtitlesUrl') if captions: @@ -287,14 +319,89 @@ class RaiIE(RaiBaseIE): 'url': captions, }] - return { + info = { 'id': content_id, - 'title': media['name'], - 'description': media.get('desc'), + 'title': title, + 'description': strip_or_none(media.get('desc')), 'thumbnails': thumbnails, 'uploader': media.get('author'), 'upload_date': unified_strdate(media.get('date')), 'duration': parse_duration(media.get('length')), - 'formats': formats, 'subtitles': subtitles, } + + info.update(relinker_info) + + return info + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + content_item_id = None + + content_item_url = self._html_search_meta( + ('og:url', 'og:video', 'og:video:secure_url', 'twitter:url', + 'twitter:player', 'jsonlink'), webpage, default=None) + if content_item_url: + content_item_id = self._search_regex( + r'ContentItem-(%s)' % self._UUID_RE, content_item_url, + 'content item id', default=None) + + if not content_item_id: + content_item_id = self._search_regex( + r'''(?x) + (?: + (?:initEdizione|drawMediaRaiTV)\(| + <(?:[^>]+\bdata-id|var\s+uniquename)= + ) + (["\']) + (?:(?!\1).)*\bContentItem-(?P<id>%s) + ''' % self._UUID_RE, + webpage, 'content item id', default=None, group='id') + + content_item_ids = set() + content_item_ids.add(content_item_id) + if video_id not in content_item_ids: + content_item_ids.add(video_id) + + for content_item_id in content_item_ids: + try: + return self._extract_from_content_id(content_item_id, url) + except GeoRestrictedError: + raise + except ExtractorError: + pass + + relinker_url = self._search_regex( + r'''(?x) + (?: + var\s+videoURL| + mediaInfo\.mediaUri + )\s*=\s* + ([\'"]) + (?P<url> + (?:https?:)? + //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\? + (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1 + ''', + webpage, 'relinker URL', group='url') + + relinker_info = self._extract_relinker_info( + urljoin(url, relinker_url), video_id) + self._sort_formats(relinker_info['formats']) + + title = self._search_regex( + r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1', + webpage, 'title', group='title', + default=None) or self._og_search_title(webpage) + + info = { + 'id': video_id, + 'title': title, + } + + info.update(relinker_info) + + return info From 361f293ab85c29ab62cb91577d2be34814d5c552 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Apr 2017 02:24:13 +0700 Subject: [PATCH 025/611] [rai] Skip not found content item id --- youtube_dl/extractor/rai.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index b77b0a08e..077546a73 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -362,7 +362,8 @@ class RaiIE(RaiBaseIE): webpage, 'content item id', default=None, group='id') content_item_ids = set() - content_item_ids.add(content_item_id) + if content_item_id: + content_item_ids.add(content_item_id) if video_id not in content_item_ids: content_item_ids.add(video_id) From a76c25146a93052f367a0fb8cdd9a08ba9cef491 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Apr 2017 02:37:18 +0700 Subject: [PATCH 026/611] [ChangeLog] Actualize --- ChangeLog | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/ChangeLog b/ChangeLog index 07725b12a..3ffc647f1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,27 @@ +version <unreleased> + +Core +[YoutubeDL] Return early when extraction of url_transparent fails + +Extractors +* [rai] Fix and improve extraction (#11790) ++ [vrv] Add support for series pages +* [limelight] Improve extraction for audio only formats +* [funimation] Fix extraction (#10696, #11773) ++ [xfileshare] Add support for vidabc.com (#12589) ++ [xfileshare] Improve extraction and extract hls formats ++ [crunchyroll] Pass geo verifcation proxy ++ [cwtv] Extract ISM formats ++ [tvplay] Bypass geo restriction ++ [vrv] Add support for vrv.co ++ [packtpub] Add support for packtpub.com (#12610) ++ [generic] Pass base_url to _parse_jwplayer_data ++ [adn] Add support for animedigitalnetwork.fr (#4866) ++ [allocine] Extract more metadata +* [allocine] Fix extraction (#12592) +* [openload] Fix extraction + + version 2017.03.26 Core From b56e41a701d73072b7d62a151b7aafd87955dfe8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Apr 2017 02:39:15 +0700 Subject: [PATCH 027/611] release 2017.04.02 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- README.md | 8 ++++---- docs/supportedsites.md | 9 +++++++-- youtube_dl/version.py | 2 +- 5 files changed, 16 insertions(+), 11 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 2f717926c..c1b737619 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.03.26*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.03.26** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.04.02*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.04.02** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.03.26 +[debug] youtube-dl version 2017.04.02 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 3ffc647f1..0199bdf1f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2017.04.02 Core [YoutubeDL] Return early when extraction of url_transparent fails diff --git a/README.md b/README.md index 86b44781c..41f647aaa 100644 --- a/README.md +++ b/README.md @@ -181,10 +181,10 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo -R, --retries RETRIES Number of retries (default is 10), or "infinite". --fragment-retries RETRIES Number of retries for a fragment (default - is 10), or "infinite" (DASH and hlsnative - only) - --skip-unavailable-fragments Skip unavailable fragments (DASH and - hlsnative only) + is 10), or "infinite" (DASH, hlsnative and + ISM) + --skip-unavailable-fragments Skip unavailable fragments (DASH, hlsnative + and ISM) --abort-on-unavailable-fragment Abort downloading when some fragment is not available --buffer-size SIZE Size of download buffer (e.g. 1024 or 16K) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index e9dbc021b..5c1855111 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -28,6 +28,7 @@ - **acast** - **acast:channel** - **AddAnime** + - **ADN**: Anime Digital Network - **AdobeTV** - **AdobeTVChannel** - **AdobeTVShow** @@ -572,6 +573,8 @@ - **orf:iptv**: iptv.ORF.at - **orf:oe1**: Radio Österreich 1 - **orf:tvthek**: ORF TVthek + - **PacktPub** + - **PacktPubCourse** - **PandaTV**: 熊猫TV - **pandora.tv**: 판도라TV - **parliamentlive.tv**: UK parliament videos @@ -629,7 +632,7 @@ - **radiofrance** - **RadioJavan** - **Rai** - - **RaiTV** + - **RaiPlay** - **RBMARadio** - **RDS**: RDS.ca - **RedBullTV** @@ -926,6 +929,8 @@ - **vpro**: npo.nl and ntr.nl - **Vrak** - **VRT** + - **vrv** + - **vrv:series** - **vube**: Vube.com - **VuClip** - **VVVVID** @@ -953,7 +958,7 @@ - **WSJ**: Wall Street Journal - **XBef** - **XboxClips** - - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To, XVIDSTAGE + - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To, XVIDSTAGE, Vid ABC - **XHamster** - **XHamsterEmbed** - **xiami:album**: 虾米音乐 - 专辑 diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 94e8198ec..f612d03ca 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.03.26' +__version__ = '2017.04.02' From b3633fa0ce0f98801582f8e4e348436b0f361eb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Apr 2017 03:20:28 +0700 Subject: [PATCH 028/611] [pericope] Add support for pscp.tv URLs --- youtube_dl/extractor/periscope.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 0e3623024..1add6b840 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -20,7 +20,7 @@ class PeriscopeBaseIE(InfoExtractor): class PeriscopeIE(PeriscopeBaseIE): IE_DESC = 'Periscope' IE_NAME = 'periscope' - _VALID_URL = r'https?://(?:www\.)?periscope\.tv/[^/]+/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/[^/]+/(?P<id>[^/?#]+)' # Alive example URLs can be found here http://onperiscope.com/ _TESTS = [{ 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', @@ -41,6 +41,9 @@ class PeriscopeIE(PeriscopeBaseIE): }, { 'url': 'https://www.periscope.tv/bastaakanoggano/1OdKrlkZZjOJX', 'only_matching': True, + }, { + 'url': 'https://www.periscope.tv/w/1ZkKzPbMVggJv', + 'only_matching': True, }] @staticmethod @@ -103,7 +106,7 @@ class PeriscopeIE(PeriscopeBaseIE): class PeriscopeUserIE(PeriscopeBaseIE): - _VALID_URL = r'https?://(?:www\.)?periscope\.tv/(?P<id>[^/]+)/?$' + _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/(?P<id>[^/]+)/?$' IE_DESC = 'Periscope user videos' IE_NAME = 'periscope:user' From 4457823dda410c5406f5ab5474b9b1f9325fa7ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Apr 2017 03:56:49 +0700 Subject: [PATCH 029/611] [extractor/common] Move censorship checks to a separate method and add check for just another ISP --- youtube_dl/extractor/common.py | 48 ++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 6c3c095f7..cdfa7000b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -547,6 +547,34 @@ class InfoExtractor(object): return encoding + def __check_blocked(self, content): + first_block = content[:512] + if ('<title>Access to this site is blocked' in content and + 'Websense' in first_block): + msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' + blocked_iframe = self._html_search_regex( + r'' + PLAYER_REGEX = r'