diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 7bd301cc8..6f1361b32 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.02.11*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.02.11** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.02.17*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.02.17** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.02.11 +[debug] youtube-dl version 2017.02.17 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.travis.yml b/.travis.yml index 4833c76e9..f41e11137 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,7 +8,10 @@ python: - "3.5" - "3.6" sudo: false -script: nosetests test --verbose +env: + - YTDL_TEST_SET=core + - YTDL_TEST_SET=download +script: ./devscripts/run_tests.sh notifications: email: - filippo.valsorda@gmail.com diff --git a/ChangeLog b/ChangeLog index cba47a296..2c90f791d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,8 +1,51 @@ -version +version 2017.02.17 + +Extractors +* [heise] Improve extraction (#9725) +* [ellentv] Improve (#11653) +* [openload] Fix extraction (#10408, #12002) ++ [theplatform] Recognize URLs with whitespaces (#12044) +* [einthusan] Relax URL regular expression (#12141, #12159) ++ [generic] Support complex JWPlayer embedded videos (#12030) +* [elpais] Improve extraction (#12139) + + +version 2017.02.16 + +Core ++ [utils] Add support for quoted string literals in --match-filter (#8050, + #12142, #12144) + +Extractors +* [ceskatelevize] Lower priority for audio description sources (#12119) +* [amcnetworks] Fix extraction (#12127) +* [pinkbike] Fix uploader extraction (#12054) ++ [onetpl] Add support for businessinsider.com.pl and plejada.pl ++ [onetpl] Add support for onet.pl (#10507) ++ [onetmvp] Add shortcut extractor ++ [vodpl] Add support for vod.pl (#12122) ++ [pornhub] Extract video URL from tv platform site (#12007, #12129) ++ [ceskatelevize] Extract DASH formats (#12119, #12133) + + +version 2017.02.14 Core * TypeError is fixed with Python 2.7.13 on Windows (#11540, #12085) +Extractor +* [zdf] Fix extraction (#12117) +* [xtube] Fix extraction for both kinds of video id (#12088) +* [xtube] Improve title extraction (#12088) ++ [lemonde] Fallback delegate extraction to generic extractor (#12115, #12116) +* [bellmedia] Allow video id longer than 6 characters (#12114) ++ [limelight] Add support for referer protected videos +* [disney] Improve extraction (#4975, #11000, #11882, #11936) +* [hotstar] Improve extraction (#12096) +* [einthusan] Fix extraction (#11416) ++ [aenetworks] Add support for lifetimemovieclub.com (#12097) +* [youtube] Fix parsing codecs (#12091) + version 2017.02.11 diff --git a/README.md b/README.md index 89876bd7a..c2a1a6b02 100644 --- a/README.md +++ b/README.md @@ -137,13 +137,13 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo --match-filter FILTER Generic video filter. Specify any key (see help for -o for a list of available keys) to match if the key is present, !key to - check if the key is not present,key > + check if the key is not present, key > NUMBER (like "comment_count > 12", also works with >=, <, <=, !=, =) to compare against a number, and & to require multiple matches. Values which are not known are excluded unless you put a question mark (?) - after the operator.For example, to only + after the operator. For example, to only match videos that have been liked more than 100 times and disliked less than 50 times (or the dislike functionality is not diff --git a/devscripts/run_tests.sh b/devscripts/run_tests.sh new file mode 100755 index 000000000..c60807215 --- /dev/null +++ b/devscripts/run_tests.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +DOWNLOAD_TESTS="age_restriction|download|subtitles|write_annotations|iqiyi_sdk_interpreter" + +test_set="" +multiprocess_args="" + +case "$YTDL_TEST_SET" in + core) + test_set="-I test_($DOWNLOAD_TESTS)\.py" + ;; + download) + test_set="-I test_(?!$DOWNLOAD_TESTS).+\.py" + multiprocess_args="--processes=4 --process-timeout=540" + ;; + *) + break + ;; +esac + +nosetests test --verbose $test_set $multiprocess_args diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 3e84f1237..5a436e8f7 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -546,8 +546,10 @@ - **OktoberfestTV** - **on.aol.com** - **OnDemandKorea** + - **onet.pl** - **onet.tv** - **onet.tv:channel** + - **OnetMVP** - **OnionStudios** - **Ooyala** - **OoyalaExternal** @@ -900,6 +902,7 @@ - **vlive** - **vlive:channel** - **Vodlocker** + - **VODPl** - **VODPlatform** - **VoiceRepublic** - **VoxMedia** diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 8bf00bea9..2cfcf743a 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# coding: utf-8 from __future__ import unicode_literals @@ -540,10 +541,10 @@ class TestYoutubeDL(unittest.TestCase): self.assertEqual(ydl._format_note({}), '') assertRegexpMatches(self, ydl._format_note({ 'vbr': 10, - }), '^\s*10k$') + }), r'^\s*10k$') assertRegexpMatches(self, ydl._format_note({ 'fps': 30, - }), '^30fps$') + }), r'^30fps$') def test_postprocessors(self): filename = 'post-processor-testfile.mp4' @@ -606,6 +607,8 @@ class TestYoutubeDL(unittest.TestCase): 'duration': 30, 'filesize': 10 * 1024, 'playlist_id': '42', + 'uploader': "變態妍字幕版 太妍 тест", + 'creator': "тест ' 123 ' тест--", } second = { 'id': '2', @@ -616,6 +619,7 @@ class TestYoutubeDL(unittest.TestCase): 'description': 'foo', 'filesize': 5 * 1024, 'playlist_id': '43', + 'uploader': "тест 123", } videos = [first, second] @@ -656,6 +660,26 @@ class TestYoutubeDL(unittest.TestCase): res = get_videos(f) self.assertEqual(res, ['1']) + f = match_filter_func('uploader = "變態妍字幕版 太妍 тест"') + res = get_videos(f) + self.assertEqual(res, ['1']) + + f = match_filter_func('uploader != "變態妍字幕版 太妍 тест"') + res = get_videos(f) + self.assertEqual(res, ['2']) + + f = match_filter_func('creator = "тест \' 123 \' тест--"') + res = get_videos(f) + self.assertEqual(res, ['1']) + + f = match_filter_func("creator = 'тест \\' 123 \\' тест--'") + res = get_videos(f) + self.assertEqual(res, ['1']) + + f = match_filter_func(r"creator = 'тест \' 123 \' тест--' & duration > 30") + res = get_videos(f) + self.assertEqual(res, []) + def test_playlist_items_selection(self): entries = [{ 'id': compat_str(i), diff --git a/test/test_download.py b/test/test_download.py index 463952989..30034f978 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -65,6 +65,10 @@ defs = gettestcases() class TestDownload(unittest.TestCase): + # Parallel testing in nosetests. See + # http://nose.readthedocs.org/en/latest/doc_tests/test_multiprocess/multiprocess.html + _multiprocess_shared_ = True + maxDiff = None def setUp(self): @@ -73,7 +77,7 @@ class TestDownload(unittest.TestCase): # Dynamically generate tests -def generator(test_case): +def generator(test_case, tname): def test_template(self): ie = youtube_dl.extractor.get_info_extractor(test_case['name']) @@ -102,6 +106,7 @@ def generator(test_case): return params = get_params(test_case.get('params', {})) + params['outtmpl'] = tname + '_' + params['outtmpl'] if is_playlist and 'playlist' not in test_case: params.setdefault('extract_flat', 'in_playlist') params.setdefault('skip_download', True) @@ -146,7 +151,7 @@ def generator(test_case): raise if try_num == RETRIES: - report_warning('Failed due to network errors, skipping...') + report_warning('%s failed due to network errors, skipping...' % tname) return print('Retrying: {0} failed tries\n\n##########\n\n'.format(try_num)) @@ -221,12 +226,12 @@ def generator(test_case): # And add them to TestDownload for n, test_case in enumerate(defs): - test_method = generator(test_case) tname = 'test_' + str(test_case['name']) i = 1 while hasattr(TestDownload, tname): tname = 'test_%s_%d' % (test_case['name'], i) i += 1 + test_method = generator(test_case, tname) test_method.__name__ = str(tname) setattr(TestDownload, test_method.__name__, test_method) del test_method diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a7bf5a1b0..68000dea2 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -56,6 +56,8 @@ from .utils import ( ExtractorError, format_bytes, formatSeconds, + GeoRestrictedError, + ISO3166Utils, locked_file, make_HTTPS_handler, MaxDownloadsReached, @@ -272,6 +274,12 @@ class YoutubeDL(object): If it returns None, the video is downloaded. match_filter_func in utils.py is one example for this. no_color: Do not emit color codes in output. + geo_bypass: Bypass geographic restriction via faking X-Forwarded-For + HTTP header (experimental) + geo_bypass_country: + Two-letter ISO 3166-2 country code that will be used for + explicit geographic restriction bypassing via faking + X-Forwarded-For HTTP header (experimental) The following options determine which downloader is picked: external_downloader: Executable of the external downloader to call. @@ -707,6 +715,14 @@ class YoutubeDL(object): return self.process_ie_result(ie_result, download, extra_info) else: return ie_result + except GeoRestrictedError as e: + msg = e.msg + if e.countries: + msg += '\nThis video is available in %s.' % ', '.join( + map(ISO3166Utils.short2full, e.countries)) + msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.' + self.report_error(msg) + break except ExtractorError as e: # An error we somewhat expected self.report_error(compat_str(e), e.format_traceback()) break @@ -847,8 +863,14 @@ class YoutubeDL(object): if self.params.get('playlistrandom', False): random.shuffle(entries) + x_forwarded_for = ie_result.get('__x_forwarded_for_ip') + for i, entry in enumerate(entries, 1): self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) + # This __x_forwarded_for_ip thing is a bit ugly but requires + # minimal changes + if x_forwarded_for: + entry['__x_forwarded_for_ip'] = x_forwarded_for extra = { 'n_entries': n_entries, 'playlist': playlist, @@ -1233,6 +1255,11 @@ class YoutubeDL(object): if cookies: res['Cookie'] = cookies + if 'X-Forwarded-For' not in res: + x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip') + if x_forwarded_for_ip: + res['X-Forwarded-For'] = x_forwarded_for_ip + return res def _calc_cookies(self, info_dict): @@ -1375,6 +1402,9 @@ class YoutubeDL(object): full_format_info = info_dict.copy() full_format_info.update(format) format['http_headers'] = self._calc_headers(full_format_info) + # Remove private housekeeping stuff + if '__x_forwarded_for_ip' in info_dict: + del info_dict['__x_forwarded_for_ip'] # TODO Central sorting goes here diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 5c5b8094b..f91d29a7b 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -414,6 +414,8 @@ def _real_main(argv=None): 'cn_verification_proxy': opts.cn_verification_proxy, 'geo_verification_proxy': opts.geo_verification_proxy, 'config_location': opts.config_location, + 'geo_bypass': opts.geo_bypass, + 'geo_bypass_country': opts.geo_bypass_country, } with YoutubeDL(ydl_opts) as ydl: diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 8437dde30..e2ddc369e 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -43,7 +43,10 @@ class DashSegmentsFD(FragmentFD): count = 0 while count <= fragment_retries: try: - success = ctx['dl'].download(target_filename, {'url': segment_url}) + success = ctx['dl'].download(target_filename, { + 'url': segment_url, + 'http_headers': info_dict.get('http_headers'), + }) if not success: return False down, target_sanitized = sanitize_open(target_filename, 'rb') diff --git a/youtube_dl/downloader/ism.py b/youtube_dl/downloader/ism.py index 93cac5e98..63a636cb7 100644 --- a/youtube_dl/downloader/ism.py +++ b/youtube_dl/downloader/ism.py @@ -238,7 +238,10 @@ class IsmFD(FragmentFD): count = 0 while count <= fragment_retries: try: - success = ctx['dl'].download(target_filename, {'url': segment_url}) + success = ctx['dl'].download(target_filename, { + 'url': segment_url, + 'http_headers': info_dict.get('http_headers'), + }) if not success: return False down, target_sanitized = sanitize_open(target_filename, 'rb') diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index c97317400..dd96a47ce 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -23,7 +23,7 @@ class AENetworksBaseIE(ThePlatformIE): class AENetworksIE(AENetworksBaseIE): IE_NAME = 'aenetworks' IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network' - _VALID_URL = r'https?://(?:www\.)?(?P(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?:shows/(?P[^/]+(?:/[^/]+){0,2})|movies/(?P[^/]+)/full-movie)' + _VALID_URL = r'https?://(?:www\.)?(?P(?:history|aetv|mylifetime|lifetimemovieclub)\.com|fyi\.tv)/(?:shows/(?P[^/]+(?:/[^/]+){0,2})|movies/(?P[^/]+)(?:/full-movie)?)' _TESTS = [{ 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', 'md5': 'a97a65f7e823ae10e9244bc5433d5fe6', @@ -62,11 +62,15 @@ class AENetworksIE(AENetworksBaseIE): }, { 'url': 'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie', 'only_matching': True + }, { + 'url': 'https://www.lifetimemovieclub.com/movies/a-killer-among-us', + 'only_matching': True }] _DOMAIN_TO_REQUESTOR_ID = { 'history.com': 'HISTORY', 'aetv.com': 'AETV', 'mylifetime.com': 'LIFETIME', + 'lifetimemovieclub.com': 'LIFETIMEMOVIECLUB', 'fyi.tv': 'FYI', } diff --git a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py index 87c803e94..b71d1a093 100644 --- a/youtube_dl/extractor/amcnetworks.py +++ b/youtube_dl/extractor/amcnetworks.py @@ -53,20 +53,30 @@ class AMCNetworksIE(ThePlatformIE): 'mbr': 'true', 'manifest': 'm3u', } - media_url = self._search_regex(r'window\.platformLinkURL\s*=\s*[\'"]([^\'"]+)', webpage, 'media url') + media_url = self._search_regex( + r'window\.platformLinkURL\s*=\s*[\'"]([^\'"]+)', + webpage, 'media url') theplatform_metadata = self._download_theplatform_metadata(self._search_regex( - r'https?://link.theplatform.com/s/([^?]+)', media_url, 'theplatform_path'), display_id) + r'link\.theplatform\.com/s/([^?]+)', + media_url, 'theplatform_path'), display_id) info = self._parse_theplatform_metadata(theplatform_metadata) video_id = theplatform_metadata['pid'] title = theplatform_metadata['title'] rating = theplatform_metadata['ratings'][0]['rating'] - auth_required = self._search_regex(r'window\.authRequired\s*=\s*(true|false);', webpage, 'auth required') + auth_required = self._search_regex( + r'window\.authRequired\s*=\s*(true|false);', + webpage, 'auth required') if auth_required == 'true': - requestor_id = self._search_regex(r'window\.requestor_id\s*=\s*[\'"]([^\'"]+)', webpage, 'requestor id') - resource = self._get_mvpd_resource(requestor_id, title, video_id, rating) - query['auth'] = self._extract_mvpd_auth(url, video_id, requestor_id, resource) + requestor_id = self._search_regex( + r'window\.requestor_id\s*=\s*[\'"]([^\'"]+)', + webpage, 'requestor id') + resource = self._get_mvpd_resource( + requestor_id, title, video_id, rating) + query['auth'] = self._extract_mvpd_auth( + url, video_id, requestor_id, resource) media_url = update_url_query(media_url, query) - formats, subtitles = self._extract_theplatform_smil(media_url, video_id) + formats, subtitles = self._extract_theplatform_smil( + media_url, video_id) self._sort_formats(formats) info.update({ 'id': video_id, @@ -78,9 +88,11 @@ class AMCNetworksIE(ThePlatformIE): if ns_keys: ns = list(ns_keys)[0] series = theplatform_metadata.get(ns + '$show') - season_number = int_or_none(theplatform_metadata.get(ns + '$season')) + season_number = int_or_none( + theplatform_metadata.get(ns + '$season')) episode = theplatform_metadata.get(ns + '$episodeTitle') - episode_number = int_or_none(theplatform_metadata.get(ns + '$episode')) + episode_number = int_or_none( + theplatform_metadata.get(ns + '$episode')) if season_number: title = 'Season %d - %s' % (season_number, title) if series: diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 486dff82d..e21045bed 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -1,13 +1,13 @@ from __future__ import unicode_literals -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import ( unified_strdate, clean_html, ) -class ArchiveOrgIE(JWPlatformBaseIE): +class ArchiveOrgIE(InfoExtractor): IE_NAME = 'archive.org' IE_DESC = 'archive.org videos' _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P[^/?#]+)(?:[?].*)?$' diff --git a/youtube_dl/extractor/bellmedia.py b/youtube_dl/extractor/bellmedia.py index 32326ed9e..1f5b6ed92 100644 --- a/youtube_dl/extractor/bellmedia.py +++ b/youtube_dl/extractor/bellmedia.py @@ -24,7 +24,7 @@ class BellMediaIE(InfoExtractor): space )\.ca| much\.com - )/.*?(?:\bvid=|-vid|~|%7E|/(?:episode)?)(?P[0-9]{6})''' + )/.*?(?:\bvid=|-vid|~|%7E|/(?:episode)?)(?P[0-9]{6,})''' _TESTS = [{ 'url': 'http://www.ctv.ca/video/player?vid=706966', 'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0', @@ -55,6 +55,9 @@ class BellMediaIE(InfoExtractor): }, { 'url': 'http://www.much.com/shows/the-almost-impossible-gameshow/928979/episode-6', 'only_matching': True, + }, { + 'url': 'http://www.ctv.ca/DCs-Legends-of-Tomorrow/Video/S2E11-Turncoat-vid1051430', + 'only_matching': True, }] _DOMAINS = { 'thecomedynetwork': 'comedy', diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 5c6e99da1..27685eed0 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -191,6 +191,10 @@ class BrightcoveLegacyIE(InfoExtractor): # These fields hold the id of the video videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') or find_param('@videoList') if videoPlayer is not None: + if isinstance(videoPlayer, list): + videoPlayer = videoPlayer[0] + if not (videoPlayer.isdigit() or videoPlayer.startswith('ref:')): + return None params['@videoPlayer'] = videoPlayer linkBase = find_param('linkBaseURL') if linkBase is not None: diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 4f88c31ad..b1dfacf80 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -13,6 +13,7 @@ from ..utils import ( float_or_none, sanitized_Request, urlencode_postdata, + USER_AGENTS, ) @@ -21,10 +22,10 @@ class CeskaTelevizeIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', 'info_dict': { - 'id': '61924494876951776', + 'id': '61924494877246241', 'ext': 'mp4', - 'title': 'Hyde Park Civilizace', - 'description': 'md5:fe93f6eda372d150759d11644ebbfb4a', + 'title': 'Hyde Park Civilizace: Život v Grónsku', + 'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 3350, }, @@ -114,70 +115,100 @@ class CeskaTelevizeIE(InfoExtractor): 'requestSource': 'iVysilani', } - req = sanitized_Request( - 'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', - data=urlencode_postdata(data)) - - req.add_header('Content-type', 'application/x-www-form-urlencoded') - req.add_header('x-addr', '127.0.0.1') - req.add_header('X-Requested-With', 'XMLHttpRequest') - req.add_header('Referer', url) - - playlistpage = self._download_json(req, playlist_id) - - playlist_url = playlistpage['url'] - if playlist_url == 'error_region': - raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) - - req = sanitized_Request(compat_urllib_parse_unquote(playlist_url)) - req.add_header('Referer', url) - - playlist_title = self._og_search_title(webpage, default=None) - playlist_description = self._og_search_description(webpage, default=None) - - playlist = self._download_json(req, playlist_id)['playlist'] - playlist_len = len(playlist) - entries = [] - for item in playlist: - is_live = item.get('type') == 'LIVE' - formats = [] - for format_id, stream_url in item['streamUrls'].items(): - formats.extend(self._extract_m3u8_formats( - stream_url, playlist_id, 'mp4', - entry_protocol='m3u8' if is_live else 'm3u8_native', - fatal=False)) - self._sort_formats(formats) - item_id = item.get('id') or item['assetId'] - title = item['title'] + for user_agent in (None, USER_AGENTS['Safari']): + req = sanitized_Request( + 'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', + data=urlencode_postdata(data)) - duration = float_or_none(item.get('duration')) - thumbnail = item.get('previewImageUrl') + req.add_header('Content-type', 'application/x-www-form-urlencoded') + req.add_header('x-addr', '127.0.0.1') + req.add_header('X-Requested-With', 'XMLHttpRequest') + if user_agent: + req.add_header('User-Agent', user_agent) + req.add_header('Referer', url) - subtitles = {} - if item.get('type') == 'VOD': - subs = item.get('subtitles') - if subs: - subtitles = self.extract_subtitles(episode_id, subs) + playlistpage = self._download_json(req, playlist_id, fatal=False) - if playlist_len == 1: - final_title = playlist_title or title - if is_live: - final_title = self._live_title(final_title) - else: - final_title = '%s (%s)' % (playlist_title, title) + if not playlistpage: + continue - entries.append({ - 'id': item_id, - 'title': final_title, - 'description': playlist_description if playlist_len == 1 else None, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - 'is_live': is_live, - }) + playlist_url = playlistpage['url'] + if playlist_url == 'error_region': + raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) + + req = sanitized_Request(compat_urllib_parse_unquote(playlist_url)) + req.add_header('Referer', url) + + playlist_title = self._og_search_title(webpage, default=None) + playlist_description = self._og_search_description(webpage, default=None) + + playlist = self._download_json(req, playlist_id, fatal=False) + if not playlist: + continue + + playlist = playlist.get('playlist') + if not isinstance(playlist, list): + continue + + playlist_len = len(playlist) + + for num, item in enumerate(playlist): + is_live = item.get('type') == 'LIVE' + formats = [] + for format_id, stream_url in item.get('streamUrls', {}).items(): + if 'playerType=flash' in stream_url: + stream_formats = self._extract_m3u8_formats( + stream_url, playlist_id, 'mp4', + entry_protocol='m3u8' if is_live else 'm3u8_native', + m3u8_id='hls-%s' % format_id, fatal=False) + else: + stream_formats = self._extract_mpd_formats( + stream_url, playlist_id, + mpd_id='dash-%s' % format_id, fatal=False) + # See https://github.com/rg3/youtube-dl/issues/12119#issuecomment-280037031 + if format_id == 'audioDescription': + for f in stream_formats: + f['source_preference'] = -10 + formats.extend(stream_formats) + + if user_agent and len(entries) == playlist_len: + entries[num]['formats'].extend(formats) + continue + + item_id = item.get('id') or item['assetId'] + title = item['title'] + + duration = float_or_none(item.get('duration')) + thumbnail = item.get('previewImageUrl') + + subtitles = {} + if item.get('type') == 'VOD': + subs = item.get('subtitles') + if subs: + subtitles = self.extract_subtitles(episode_id, subs) + + if playlist_len == 1: + final_title = playlist_title or title + if is_live: + final_title = self._live_title(final_title) + else: + final_title = '%s (%s)' % (playlist_title, title) + + entries.append({ + 'id': item_id, + 'title': final_title, + 'description': playlist_description if playlist_len == 1 else None, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': is_live, + }) + + for e in entries: + self._sort_formats(e['formats']) return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 9681453ca..272da74b6 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -6,6 +6,7 @@ import hashlib import json import netrc import os +import random import re import socket import sys @@ -39,7 +40,10 @@ from ..utils import ( ExtractorError, fix_xml_ampersands, float_or_none, + GeoRestrictedError, + GeoUtils, int_or_none, + js_to_json, parse_iso8601, RegexNotFoundError, sanitize_filename, @@ -319,17 +323,31 @@ class InfoExtractor(object): _real_extract() methods and define a _VALID_URL regexp. Probably, they should also be added to the list of extractors. + _GEO_BYPASS attribute may be set to False in order to disable + geo restriction bypass mechanisms for a particular extractor. + Though it won't disable explicit geo restriction bypass based on + country code provided with geo_bypass_country. (experimental) + + _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted + countries for this extractor. One of these countries will be used by + geo restriction bypass mechanism right away in order to bypass + geo restriction, of course, if the mechanism is not disabled. (experimental) + Finally, the _WORKING attribute should be set to False for broken IEs in order to warn the users and skip the tests. """ _ready = False _downloader = None + _x_forwarded_for_ip = None + _GEO_BYPASS = True + _GEO_COUNTRIES = None _WORKING = True def __init__(self, downloader=None): """Constructor. Receives an optional downloader.""" self._ready = False + self._x_forwarded_for_ip = None self.set_downloader(downloader) @classmethod @@ -358,15 +376,42 @@ class InfoExtractor(object): def initialize(self): """Initializes an instance (authentication, etc).""" + self.__initialize_geo_bypass() if not self._ready: self._real_initialize() self._ready = True + def __initialize_geo_bypass(self): + if not self._x_forwarded_for_ip: + country_code = self._downloader.params.get('geo_bypass_country', None) + # If there is no explicit country for geo bypass specified and + # the extractor is known to be geo restricted let's fake IP + # as X-Forwarded-For right away. + if (not country_code and + self._GEO_BYPASS and + self._downloader.params.get('geo_bypass', True) and + self._GEO_COUNTRIES): + country_code = random.choice(self._GEO_COUNTRIES) + if country_code: + self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) + if self._downloader.params.get('verbose', False): + self._downloader.to_stdout( + '[debug] Using fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip) + def extract(self, url): """Extracts URL information and returns it in list of dicts.""" try: - self.initialize() - return self._real_extract(url) + for _ in range(2): + try: + self.initialize() + ie_result = self._real_extract(url) + if self._x_forwarded_for_ip: + ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip + return ie_result + except GeoRestrictedError as e: + if self.__maybe_fake_ip_and_retry(e.countries): + continue + raise except ExtractorError: raise except compat_http_client.IncompleteRead as e: @@ -374,6 +419,19 @@ class InfoExtractor(object): except (KeyError, StopIteration) as e: raise ExtractorError('An extractor error has occurred.', cause=e) + def __maybe_fake_ip_and_retry(self, countries): + if (not self._downloader.params.get('geo_bypass_country', None) and + self._GEO_BYPASS and + self._downloader.params.get('geo_bypass', True) and + not self._x_forwarded_for_ip and + countries): + self._x_forwarded_for_ip = GeoUtils.random_ipv4(random.choice(countries)) + if self._x_forwarded_for_ip: + self.report_warning( + 'Video is geo restricted. Retrying extraction with fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip) + return True + return False + def set_downloader(self, downloader): """Sets the downloader for this IE.""" self._downloader = downloader @@ -433,6 +491,15 @@ class InfoExtractor(object): if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] + # Some sites check X-Forwarded-For HTTP header in order to figure out + # the origin of the client behind proxy. This allows bypassing geo + # restriction by faking this header's value to IP that belongs to some + # geo unrestricted country. We will do so once we encounter any + # geo restriction error. + if self._x_forwarded_for_ip: + if 'X-Forwarded-For' not in headers: + headers['X-Forwarded-For'] = self._x_forwarded_for_ip + urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query) if urlh is False: assert not fatal @@ -608,10 +675,8 @@ class InfoExtractor(object): expected=True) @staticmethod - def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'): - raise ExtractorError( - '%s. You might want to use --proxy to workaround.' % msg, - expected=True) + def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None): + raise GeoRestrictedError(msg, countries=countries) # Methods for following #608 @staticmethod @@ -2073,6 +2138,123 @@ class InfoExtractor(object): }) return formats + @staticmethod + def _find_jwplayer_data(webpage): + mobj = re.search( + r'jwplayer\((?P[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P[^)]+)\)', + webpage) + if mobj: + return mobj.group('options') + + def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs): + jwplayer_data = self._parse_json( + self._find_jwplayer_data(webpage), video_id, + transform_source=js_to_json) + return self._parse_jwplayer_data( + jwplayer_data, video_id, *args, **kwargs) + + def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, + m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): + # JWPlayer backward compatibility: flattened playlists + # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 + if 'playlist' not in jwplayer_data: + jwplayer_data = {'playlist': [jwplayer_data]} + + entries = [] + + # JWPlayer backward compatibility: single playlist item + # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10 + if not isinstance(jwplayer_data['playlist'], list): + jwplayer_data['playlist'] = [jwplayer_data['playlist']] + + for video_data in jwplayer_data['playlist']: + # JWPlayer backward compatibility: flattened sources + # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 + if 'sources' not in video_data: + video_data['sources'] = [video_data] + + this_video_id = video_id or video_data['mediaid'] + + formats = [] + for source in video_data['sources']: + source_url = self._proto_relative_url(source['file']) + if base_url: + source_url = compat_urlparse.urljoin(base_url, source_url) + source_type = source.get('type') or '' + ext = mimetype2ext(source_type) or determine_ext(source_url) + if source_type == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + source_url, this_video_id, mpd_id=mpd_id, fatal=False)) + # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 + elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'): + formats.append({ + 'url': source_url, + 'vcodec': 'none', + 'ext': ext, + }) + else: + height = int_or_none(source.get('height')) + if height is None: + # Often no height is provided but there is a label in + # format like 1080p. + height = int_or_none(self._search_regex( + r'^(\d{3,})[pP]$', source.get('label') or '', + 'height', default=None)) + a_format = { + 'url': source_url, + 'width': int_or_none(source.get('width')), + 'height': height, + 'ext': ext, + } + if source_url.startswith('rtmp'): + a_format['ext'] = 'flv' + + # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as + # of jwplayer.flash.swf + rtmp_url_parts = re.split( + r'((?:mp4|mp3|flv):)', source_url, 1) + if len(rtmp_url_parts) == 3: + rtmp_url, prefix, play_path = rtmp_url_parts + a_format.update({ + 'url': rtmp_url, + 'play_path': prefix + play_path, + }) + if rtmp_params: + a_format.update(rtmp_params) + formats.append(a_format) + self._sort_formats(formats) + + subtitles = {} + tracks = video_data.get('tracks') + if tracks and isinstance(tracks, list): + for track in tracks: + if track.get('kind') != 'captions': + continue + track_url = urljoin(base_url, track.get('file')) + if not track_url: + continue + subtitles.setdefault(track.get('label') or 'en', []).append({ + 'url': self._proto_relative_url(track_url) + }) + + entries.append({ + 'id': this_video_id, + 'title': video_data['title'] if require_title else video_data.get('title'), + 'description': video_data.get('description'), + 'thumbnail': self._proto_relative_url(video_data.get('image')), + 'timestamp': int_or_none(video_data.get('pubdate')), + 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), + 'subtitles': subtitles, + 'formats': formats, + }) + if len(entries) == 1: + return entries[0] + else: + return self.playlist_result(entries) + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() diff --git a/youtube_dl/extractor/commonmistakes.py b/youtube_dl/extractor/commonmistakes.py index d3ed4a9a4..79f7a9cd1 100644 --- a/youtube_dl/extractor/commonmistakes.py +++ b/youtube_dl/extractor/commonmistakes.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals +import sys + from .common import InfoExtractor from ..utils import ExtractorError @@ -33,7 +35,9 @@ class UnicodeBOMIE(InfoExtractor): IE_DESC = False _VALID_URL = r'(?P\ufeff)(?P.*)$' - _TESTS = [{ + # Disable test for python 3.2 since BOM is broken in re in this version + # (see https://github.com/rg3/youtube-dl/issues/9751) + _TESTS = [] if (3, 0) < sys.version_info <= (3, 3) else [{ 'url': '\ufeffhttp://www.youtube.com/watch?v=BaW_jenozKc', 'only_matching': True, }] diff --git a/youtube_dl/extractor/disney.py b/youtube_dl/extractor/disney.py index 396873c6d..939d1338c 100644 --- a/youtube_dl/extractor/disney.py +++ b/youtube_dl/extractor/disney.py @@ -9,13 +9,15 @@ from ..utils import ( unified_strdate, compat_str, determine_ext, + ExtractorError, ) class DisneyIE(InfoExtractor): _VALID_URL = r'''(?x) - https?://(?P(?:[^/]+\.)?(?:disney\.[a-z]{2,3}(?:\.[a-z]{2})?|disney(?:(?:me|latino)\.com|turkiye\.com\.tr)|starwars\.com))/(?:embed/|(?:[^/]+/)+[\w-]+-)(?P[a-z0-9]{24})''' + https?://(?P(?:[^/]+\.)?(?:disney\.[a-z]{2,3}(?:\.[a-z]{2})?|disney(?:(?:me|latino)\.com|turkiye\.com\.tr)|(?:starwars|marvelkids)\.com))/(?:(?:embed/|(?:[^/]+/)+[\w-]+-)(?P[a-z0-9]{24})|(?:[^/]+/)?(?P[^/?#]+))''' _TESTS = [{ + # Disney.EmbedVideo 'url': 'http://video.disney.com/watch/moana-trailer-545ed1857afee5a0ec239977', 'info_dict': { 'id': '545ed1857afee5a0ec239977', @@ -28,6 +30,20 @@ class DisneyIE(InfoExtractor): # m3u8 download 'skip_download': True, } + }, { + # Grill.burger + 'url': 'http://www.starwars.com/video/rogue-one-a-star-wars-story-intro-featurette', + 'info_dict': { + 'id': '5454e9f4e9804a552e3524c8', + 'ext': 'mp4', + 'title': '"Intro" Featurette: Rogue One: A Star Wars Story', + 'upload_date': '20170104', + 'description': 'Go behind-the-scenes of Rogue One: A Star Wars Story in this featurette with Director Gareth Edwards and the cast of the film.', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } }, { 'url': 'http://videos.disneylatino.com/ver/spider-man-de-regreso-a-casa-primer-adelanto-543a33a1850bdcfcca13bae2', 'only_matching': True, @@ -43,31 +59,55 @@ class DisneyIE(InfoExtractor): }, { 'url': 'http://www.starwars.com/embed/54690d1e6c42e5f09a0fb097', 'only_matching': True, + }, { + 'url': 'http://spiderman.marvelkids.com/embed/522900d2ced3c565e4cc0677', + 'only_matching': True, + }, { + 'url': 'http://spiderman.marvelkids.com/videos/contest-of-champions-part-four-clip-1', + 'only_matching': True, + }, { + 'url': 'http://disneyjunior.en.disneyme.com/dj/watch-my-friends-tigger-and-pooh-promo', + 'only_matching': True, + }, { + 'url': 'http://disneyjunior.disney.com/galactech-the-galactech-grab-galactech-an-admiral-rescue', + 'only_matching': True, }] def _real_extract(self, url): - domain, video_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage( - 'http://%s/embed/%s' % (domain, video_id), video_id) - video_data = self._parse_json(self._search_regex( - r'Disney\.EmbedVideo=({.+});', webpage, 'embed data'), video_id)['video'] + domain, video_id, display_id = re.match(self._VALID_URL, url).groups() + if not video_id: + webpage = self._download_webpage(url, display_id) + grill = re.sub(r'"\s*\+\s*"', '', self._search_regex( + r'Grill\.burger\s*=\s*({.+})\s*:', + webpage, 'grill data')) + page_data = next(s for s in self._parse_json(grill, display_id)['stack'] if s.get('type') == 'video') + video_data = page_data['data'][0] + else: + webpage = self._download_webpage( + 'http://%s/embed/%s' % (domain, video_id), video_id) + page_data = self._parse_json(self._search_regex( + r'Disney\.EmbedVideo\s*=\s*({.+});', + webpage, 'embed data'), video_id) + video_data = page_data['video'] for external in video_data.get('externals', []): if external.get('source') == 'vevo': return self.url_result('vevo:' + external['data_id'], 'Vevo') + video_id = video_data['id'] title = video_data['title'] formats = [] for flavor in video_data.get('flavors', []): flavor_format = flavor.get('format') flavor_url = flavor.get('url') - if not flavor_url or not re.match(r'https?://', flavor_url): + if not flavor_url or not re.match(r'https?://', flavor_url) or flavor_format == 'mp4_access': continue tbr = int_or_none(flavor.get('bitrate')) if tbr == 99999: formats.extend(self._extract_m3u8_formats( - flavor_url, video_id, 'mp4', m3u8_id=flavor_format, fatal=False)) + flavor_url, video_id, 'mp4', + m3u8_id=flavor_format, fatal=False)) continue format_id = [] if flavor_format: @@ -88,6 +128,10 @@ class DisneyIE(InfoExtractor): 'ext': ext, 'vcodec': 'none' if (width == 0 and height == 0) else None, }) + if not formats and video_data.get('expired'): + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, page_data['translations']['video_expired']), + expected=True) self._sort_formats(formats) subtitles = {} diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index bcd9fe2a0..e7abc8889 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -20,6 +20,7 @@ from ..utils import ( class DramaFeverBaseIE(AMPIE): _LOGIN_URL = 'https://www.dramafever.com/accounts/login/' _NETRC_MACHINE = 'dramafever' + _GEO_COUNTRIES = ['US', 'CA'] _CONSUMER_SECRET = 'DA59dtVXYLxajktV' @@ -116,8 +117,9 @@ class DramaFeverIE(DramaFeverBaseIE): 'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError): - raise ExtractorError( - 'Currently unavailable in your country.', expected=True) + self.raise_geo_restricted( + msg='Currently unavailable in your country', + countries=self._GEO_COUNTRIES) raise series_id, episode_number = video_id.split('.') diff --git a/youtube_dl/extractor/einthusan.py b/youtube_dl/extractor/einthusan.py index 6ca07a13d..3f6268637 100644 --- a/youtube_dl/extractor/einthusan.py +++ b/youtube_dl/extractor/einthusan.py @@ -1,67 +1,97 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 +import json + from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_urlparse, + compat_str, +) from ..utils import ( - remove_start, - sanitized_Request, + extract_attributes, + ExtractorError, + get_elements_by_class, + urlencode_postdata, ) class EinthusanIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?einthusan\.com/movies/watch.php\?([^#]*?)id=(?P[0-9]+)' - _TESTS = [ - { - 'url': 'http://www.einthusan.com/movies/watch.php?id=2447', - 'md5': 'd71379996ff5b7f217eca034c34e3461', - 'info_dict': { - 'id': '2447', - 'ext': 'mp4', - 'title': 'Ek Villain', - 'thumbnail': r're:^https?://.*\.jpg$', - 'description': 'md5:9d29fc91a7abadd4591fb862fa560d93', - } - }, - { - 'url': 'http://www.einthusan.com/movies/watch.php?id=1671', - 'md5': 'b16a6fd3c67c06eb7c79c8a8615f4213', - 'info_dict': { - 'id': '1671', - 'ext': 'mp4', - 'title': 'Soodhu Kavvuum', - 'thumbnail': r're:^https?://.*\.jpg$', - 'description': 'md5:b40f2bf7320b4f9414f3780817b2af8c', - } - }, - ] + _VALID_URL = r'https?://einthusan\.tv/movie/watch/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://einthusan.tv/movie/watch/9097/', + 'md5': 'ff0f7f2065031b8a2cf13a933731c035', + 'info_dict': { + 'id': '9097', + 'ext': 'mp4', + 'title': 'Ae Dil Hai Mushkil', + 'description': 'md5:33ef934c82a671a94652a9b4e54d931b', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }, { + 'url': 'https://einthusan.tv/movie/watch/51MZ/?lang=hindi', + 'only_matching': True, + }] + + # reversed from jsoncrypto.prototype.decrypt() in einthusan-PGMovieWatcher.js + def _decrypt(self, encrypted_data, video_id): + return self._parse_json(base64.b64decode(( + encrypted_data[:10] + encrypted_data[-1] + encrypted_data[12:-1] + ).encode('ascii')).decode('utf-8'), video_id) def _real_extract(self, url): video_id = self._match_id(url) - request = sanitized_Request(url) - request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0') - webpage = self._download_webpage(request, video_id) + webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'

]+class=["\']movie-title["\'][^>]*>(.+?)

', - webpage, 'title') + title = self._html_search_regex(r'

([^<]+)

', webpage, 'title') - video_id = self._search_regex( - r'data-movieid=["\'](\d+)', webpage, 'video id', default=video_id) + player_params = extract_attributes(self._search_regex( + r'(]+id="UIVideoPlayer"[^>]+>)', webpage, 'player parameters')) - m3u8_url = self._download_webpage( - 'http://cdn.einthusan.com/geturl/%s/hd/London,Washington,Toronto,Dallas,San,Sydney/' - % video_id, video_id, headers={'Referer': url}) - formats = self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native') + page_id = self._html_search_regex( + ']+data-pageid="([^"]+)"', webpage, 'page ID') + video_data = self._download_json( + 'https://einthusan.tv/ajax/movie/watch/%s/' % video_id, video_id, + data=urlencode_postdata({ + 'xEvent': 'UIVideoPlayer.PingOutcome', + 'xJson': json.dumps({ + 'EJOutcomes': player_params['data-ejpingables'], + 'NativeHLS': False + }), + 'arcVersion': 3, + 'appVersion': 59, + 'gorilla.csrf.Token': page_id, + }))['Data'] - description = self._html_search_meta('description', webpage) + if isinstance(video_data, compat_str) and video_data.startswith('/ratelimited/'): + raise ExtractorError( + 'Download rate reached. Please try again later.', expected=True) + + ej_links = self._decrypt(video_data['EJLinks'], video_id) + + formats = [] + + m3u8_url = ej_links.get('HLSLink') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native')) + + mp4_url = ej_links.get('MP4Link') + if mp4_url: + formats.append({ + 'url': mp4_url, + }) + + self._sort_formats(formats) + + description = get_elements_by_class('synopsis', webpage)[0] thumbnail = self._html_search_regex( - r'''''', - webpage, "thumbnail url", fatal=False) + r''']+src=(["'])(?P(?!\1).+?/moviecovers/(?!\1).+?)\1''', + webpage, 'thumbnail url', fatal=False, group='url') if thumbnail is not None: - thumbnail = compat_urlparse.urljoin(url, remove_start(thumbnail, '..')) + thumbnail = compat_urlparse.urljoin(url, thumbnail) return { 'id': video_id, diff --git a/youtube_dl/extractor/ellentv.py b/youtube_dl/extractor/ellentv.py index 74bbc5c51..e0a13dd76 100644 --- a/youtube_dl/extractor/ellentv.py +++ b/youtube_dl/extractor/ellentv.py @@ -1,13 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals -import json - from .common import InfoExtractor -from ..utils import ( - ExtractorError, - NO_DEFAULT, -) +from .kaltura import KalturaIE +from ..utils import NO_DEFAULT class EllenTVIE(InfoExtractor): @@ -65,7 +61,7 @@ class EllenTVIE(InfoExtractor): if partner_id and kaltura_id: break - return self.url_result('kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura') + return self.url_result('kaltura:%s:%s' % (partner_id, kaltura_id), KalturaIE.ie_key()) class EllenTVClipsIE(InfoExtractor): @@ -77,14 +73,14 @@ class EllenTVClipsIE(InfoExtractor): 'id': 'meryl-streep-vanessa-hudgens', 'title': 'Meryl Streep, Vanessa Hudgens', }, - 'playlist_mincount': 7, + 'playlist_mincount': 5, } def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) - playlist = self._extract_playlist(webpage) + playlist = self._extract_playlist(webpage, playlist_id) return { '_type': 'playlist', @@ -93,16 +89,13 @@ class EllenTVClipsIE(InfoExtractor): 'entries': self._extract_entries(playlist) } - def _extract_playlist(self, webpage): + def _extract_playlist(self, webpage, playlist_id): json_string = self._search_regex(r'playerView.addClips\(\[\{(.*?)\}\]\);', webpage, 'json') - try: - return json.loads('[{' + json_string + '}]') - except ValueError as ve: - raise ExtractorError('Failed to download JSON', cause=ve) + return self._parse_json('[{' + json_string + '}]', playlist_id) def _extract_entries(self, playlist): return [ self.url_result( 'kaltura:%s:%s' % (item['kaltura_partner_id'], item['kaltura_entry_id']), - 'Kaltura') + KalturaIE.ie_key(), video_id=item['kaltura_entry_id']) for item in playlist] diff --git a/youtube_dl/extractor/elpais.py b/youtube_dl/extractor/elpais.py index 99e00cf3c..b89f6db62 100644 --- a/youtube_dl/extractor/elpais.py +++ b/youtube_dl/extractor/elpais.py @@ -39,6 +39,18 @@ class ElPaisIE(InfoExtractor): 'description': 'La nave portaba cientos de ánforas y se hundió cerca de la isla de Cabrera por razones desconocidas', 'upload_date': '20170127', }, + }, { + 'url': 'http://epv.elpais.com/epv/2017/02/14/programa_la_voz_de_inaki/1487062137_075943.html', + 'info_dict': { + 'id': '1487062137_075943', + 'ext': 'mp4', + 'title': 'Disyuntivas', + 'description': 'md5:a0fb1485c4a6a8a917e6f93878e66218', + 'upload_date': '20170214', + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -59,14 +71,15 @@ class ElPaisIE(InfoExtractor): video_url = prefix + video_suffix thumbnail_suffix = self._search_regex( r"(?:URLMediaStill|urlFotogramaFijo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'", - webpage, 'thumbnail URL', fatal=False) + webpage, 'thumbnail URL', default=None) thumbnail = ( None if thumbnail_suffix is None - else prefix + thumbnail_suffix) + else prefix + thumbnail_suffix) or self._og_search_thumbnail(webpage) title = self._html_search_regex( - (r"tituloVideo\s*=\s*'([^']+)'", webpage, 'title', - r'

]*>([^<]+)'), + webpage, 'title', default=None) or self._og_search_title(webpage) upload_date = unified_strdate(self._search_regex( r'

', webpage, 'upload date', default=None) or self._html_search_meta( diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 76ad7c40b..55b4782d3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -694,6 +694,8 @@ from .ondemandkorea import OnDemandKoreaIE from .onet import ( OnetIE, OnetChannelIE, + OnetMVPIE, + OnetPlIE, ) from .onionstudios import OnionStudiosIE from .ooyala import ( @@ -1007,6 +1009,7 @@ from .tvc import ( ) from .tvigle import TvigleIE from .tvland import TVLandIE +from .tvn24 import TVN24IE from .tvnoe import TVNoeIE from .tvp import ( TVPEmbedIE, @@ -1147,6 +1150,7 @@ from .vlive import ( VLiveChannelIE ) from .vodlocker import VodlockerIE +from .vodpl import VODPlIE from .vodplatform import VODPlatformIE from .voicerepublic import VoiceRepublicIE from .voxmedia import VoxMediaIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 1c233f038..9868ca6d0 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -20,6 +20,7 @@ from ..utils import ( float_or_none, HEADRequest, is_html, + js_to_json, orderedSet, sanitized_Request, smuggle_url, @@ -961,6 +962,16 @@ class GenericIE(InfoExtractor): 'skip_download': True, } }, + # Complex jwplayer + { + 'url': 'http://www.indiedb.com/games/king-machine/videos', + 'info_dict': { + 'id': 'videos', + 'ext': 'mp4', + 'title': 'king machine trailer 1', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }, # rtl.nl embed { 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen', @@ -991,19 +1002,6 @@ class GenericIE(InfoExtractor): 'title': 'Os Guinness // Is It Fools Talk? // Unbelievable? Conference 2014', }, }, - # Kaltura embed protected with referrer - { - 'url': 'http://www.disney.nl/disney-channel/filmpjes/achter-de-schermen#/videoId/violetta-achter-de-schermen-ruggero', - 'info_dict': { - 'id': '1_g4fbemnq', - 'ext': 'mp4', - 'title': 'Violetta - Achter De Schermen - Ruggero', - 'description': 'Achter de schermen met Ruggero', - 'timestamp': 1435133761, - 'upload_date': '20150624', - 'uploader_id': 'echojecka', - }, - }, # Kaltura embed with single quotes { 'url': 'http://fod.infobase.com/p_ViewPlaylist.aspx?AssignmentID=NUN8ZY', @@ -1503,7 +1501,12 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, 'add_ie': [VideoPressIE.ie_key()], - } + }, + { + # ThePlatform embedded with whitespaces in URLs + 'url': 'http://www.golfchannel.com/topics/shows/golftalkcentral.htm', + 'only_matching': True, + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -2350,8 +2353,9 @@ class GenericIE(InfoExtractor): 'Channel': 'channel', 'ChannelList': 'channel_list', } - return self.url_result('limelight:%s:%s' % ( - lm[mobj.group(1)], mobj.group(2)), 'Limelight%s' % mobj.group(1), mobj.group(2)) + return self.url_result(smuggle_url('limelight:%s:%s' % ( + lm[mobj.group(1)], mobj.group(2)), {'source_url': url}), + 'Limelight%s' % mobj.group(1), mobj.group(2)) mobj = re.search( r'''(?sx) @@ -2361,7 +2365,9 @@ class GenericIE(InfoExtractor): value=(["\'])(?:(?!\3).)*mediaId=(?P[a-z0-9]{32}) ''', webpage) if mobj: - return self.url_result('limelight:media:%s' % mobj.group('id')) + return self.url_result(smuggle_url( + 'limelight:media:%s' % mobj.group('id'), + {'source_url': url}), 'LimelightMedia', mobj.group('id')) # Look for AdobeTVVideo embeds mobj = re.search( @@ -2498,6 +2504,15 @@ class GenericIE(InfoExtractor): self._sort_formats(entry['formats']) return self.playlist_result(entries) + jwplayer_data_str = self._find_jwplayer_data(webpage) + if jwplayer_data_str: + try: + jwplayer_data = self._parse_json( + jwplayer_data_str, video_id, transform_source=js_to_json) + return self._parse_jwplayer_data(jwplayer_data, video_id) + except ExtractorError: + pass + def check_video(vurl): if YoutubeIE.suitable(vurl): return True diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index f28e6fbf5..b205bfc7c 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -37,6 +37,7 @@ class GoIE(AdobePassIE): } } _VALID_URL = r'https?://(?:(?P%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P\w+)|season-\d+/\d+-(?P[^/?#]+))' % '|'.join(_SITE_INFO.keys()) + _GEO_COUNTRIES = ['US'] _TESTS = [{ 'url': 'http://abc.go.com/shows/castle/video/most-recent/vdka0_g86w5onx', 'info_dict': { @@ -101,6 +102,10 @@ class GoIE(AdobePassIE): video_id, data=urlencode_postdata(data), headers=self.geo_verification_headers()) errors = entitlement.get('errors', {}).get('errors', []) if errors: + for error in errors: + if error.get('code') == 1002: + self.raise_geo_restricted( + error['message'], countries=self._GEO_COUNTRIES) error_message = ', '.join([error['message'] for error in errors]) raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) asset_url += '?' + entitlement['uplynkData']['sessionKey'] diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index 1629cdb8d..382f32771 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -6,59 +6,58 @@ from ..utils import ( determine_ext, int_or_none, parse_iso8601, + xpath_text, ) class HeiseIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?://(?:www\.)?heise\.de/video/artikel/ - .+?(?P[0-9]+)\.html(?:$|[?#]) - ''' - _TEST = { - 'url': ( - 'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html' - ), + _VALID_URL = r'https?://(?:www\.)?heise\.de/(?:[^/]+/)+[^/]+-(?P[0-9]+)\.html' + _TESTS = [{ + 'url': 'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html', 'md5': 'ffed432483e922e88545ad9f2f15d30e', 'info_dict': { 'id': '2404147', 'ext': 'mp4', - 'title': ( - "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / Peilsender Smartphone" - ), + 'title': "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / Peilsender Smartphone", 'format_id': 'mp4_720p', 'timestamp': 1411812600, 'upload_date': '20140927', - 'description': 'In uplink-Episode 3.3 geht es darum, wie man sich von Cloud-Anbietern emanzipieren kann, worauf man beim Kauf einer Tastatur achten sollte und was Smartphones über uns verraten.', - 'thumbnail': r're:^https?://.*\.jpe?g$', + 'description': 'md5:c934cbfb326c669c2bcabcbe3d3fcd20', + 'thumbnail': r're:^https?://.*/gallery/$', } - } + }, { + 'url': 'http://www.heise.de/ct/artikel/c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2403911.html', + 'only_matching': True, + }, { + 'url': 'http://www.heise.de/newsticker/meldung/c-t-uplink-Owncloud-Tastaturen-Peilsender-Smartphone-2404251.html?wt_mc=rss.ho.beitrag.atom', + 'only_matching': True, + }, { + 'url': 'http://www.heise.de/ct/ausgabe/2016-12-Spiele-3214137.html', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) container_id = self._search_regex( - r'

]+data-container="([0-9]+)"', webpage, 'container ID') sequenz_id = self._search_regex( - r'
]+data-sequenz="([0-9]+)"', webpage, 'sequenz ID') - data_url = 'http://www.heise.de/videout/feed?container=%s&sequenz=%s' % (container_id, sequenz_id) - doc = self._download_xml(data_url, video_id) - info = { - 'id': video_id, - 'thumbnail': self._og_search_thumbnail(webpage), - 'timestamp': parse_iso8601( - self._html_search_meta('date', webpage)), - 'description': self._og_search_description(webpage), - } + title = self._html_search_meta('fulltitle', webpage, default=None) + if not title or title == "c't": + title = self._search_regex( + r']+class="videoplayerjw"[^>]+data-title="([^"]+)"', + webpage, 'title') - title = self._html_search_meta('fulltitle', webpage) - if title: - info['title'] = title - else: - info['title'] = self._og_search_title(webpage) + doc = self._download_xml( + 'http://www.heise.de/videout/feed', video_id, query={ + 'container': container_id, + 'sequenz': sequenz_id, + }) formats = [] for source_node in doc.findall('.//{http://rss.jwpcdn.com/}source'): @@ -74,6 +73,18 @@ class HeiseIE(InfoExtractor): 'height': height, }) self._sort_formats(formats) - info['formats'] = formats - return info + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'description', webpage) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': (xpath_text(doc, './/{http://rss.jwpcdn.com/}image') or + self._og_search_thumbnail(webpage)), + 'timestamp': parse_iso8601( + self._html_search_meta('date', webpage)), + 'formats': formats, + } diff --git a/youtube_dl/extractor/hotstar.py b/youtube_dl/extractor/hotstar.py index f05d765d6..3a7a66a34 100644 --- a/youtube_dl/extractor/hotstar.py +++ b/youtube_dl/extractor/hotstar.py @@ -34,11 +34,9 @@ class HotStarIE(InfoExtractor): 'only_matching': True, }] - _GET_CONTENT_TEMPLATE = 'http://account.hotstar.com/AVS/besc?action=GetAggregatedContentDetails&channel=PCTV&contentId=%s' - _GET_CDN_TEMPLATE = 'http://getcdn.hotstar.com/AVS/besc?action=GetCDN&asJson=Y&channel=%s&id=%s&type=%s' - - def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', fatal=True): - json_data = super(HotStarIE, self)._download_json(url_or_request, video_id, note, fatal=fatal) + def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', fatal=True, query=None): + json_data = super(HotStarIE, self)._download_json( + url_or_request, video_id, note, fatal=fatal, query=query) if json_data['resultCode'] != 'OK': if fatal: raise ExtractorError(json_data['errorDescription']) @@ -48,20 +46,37 @@ class HotStarIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) video_data = self._download_json( - self._GET_CONTENT_TEMPLATE % video_id, - video_id)['contentInfo'][0] + 'http://account.hotstar.com/AVS/besc', video_id, query={ + 'action': 'GetAggregatedContentDetails', + 'channel': 'PCTV', + 'contentId': video_id, + })['contentInfo'][0] + title = video_data['episodeTitle'] + + if video_data.get('encrypted') == 'Y': + raise ExtractorError('This video is DRM protected.', expected=True) formats = [] - # PCTV for extracting f4m manifest - for f in ('TABLET',): + for f in ('JIO',): format_data = self._download_json( - self._GET_CDN_TEMPLATE % (f, video_id, 'VOD'), - video_id, 'Downloading %s JSON metadata' % f, fatal=False) + 'http://getcdn.hotstar.com/AVS/besc', + video_id, 'Downloading %s JSON metadata' % f, + fatal=False, query={ + 'action': 'GetCDN', + 'asJson': 'Y', + 'channel': f, + 'id': video_id, + 'type': 'VOD', + }) if format_data: - format_url = format_data['src'] + format_url = format_data.get('src') + if not format_url: + continue ext = determine_ext(format_url) if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats(format_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + m3u8_id='hls', fatal=False)) elif ext == 'f4m': # produce broken files continue @@ -75,9 +90,12 @@ class HotStarIE(InfoExtractor): return { 'id': video_id, - 'title': video_data['episodeTitle'], + 'title': title, 'description': video_data.get('description'), 'duration': int_or_none(video_data.get('duration')), 'timestamp': int_or_none(video_data.get('broadcastDate')), 'formats': formats, + 'episode': title, + 'episode_number': int_or_none(video_data.get('episodeNumber')), + 'series': video_data.get('contentTitle'), } diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index b0d860452..021c6b278 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -24,6 +24,7 @@ from ..utils import ( class ITVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P[0-9a-zA-Z]+)' + _GEO_COUNTRIES = ['GB'] _TEST = { 'url': 'http://www.itv.com/hub/mr-bean-animated-series/2a2936a0053', 'info_dict': { @@ -98,7 +99,11 @@ class ITVIE(InfoExtractor): headers=headers, data=etree.tostring(req_env)) playlist = xpath_element(resp_env, './/Playlist') if playlist is None: + fault_code = xpath_text(resp_env, './/faultcode') fault_string = xpath_text(resp_env, './/faultstring') + if fault_code == 'InvalidGeoRegion': + self.raise_geo_restricted( + msg=fault_string, countries=self._GEO_COUNTRIES) raise ExtractorError('%s said: %s' % (self.IE_NAME, fault_string)) title = xpath_text(playlist, 'EpisodeTitle', fatal=True) video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index aff7ab49a..33d55f770 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -4,139 +4,9 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - determine_ext, - float_or_none, - int_or_none, - js_to_json, - mimetype2ext, - urljoin, -) -class JWPlatformBaseIE(InfoExtractor): - @staticmethod - def _find_jwplayer_data(webpage): - # TODO: Merge this with JWPlayer-related codes in generic.py - - mobj = re.search( - r'jwplayer\((?P[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P[^)]+)\)', - webpage) - if mobj: - return mobj.group('options') - - def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs): - jwplayer_data = self._parse_json( - self._find_jwplayer_data(webpage), video_id, - transform_source=js_to_json) - return self._parse_jwplayer_data( - jwplayer_data, video_id, *args, **kwargs) - - def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, - m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): - # JWPlayer backward compatibility: flattened playlists - # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 - if 'playlist' not in jwplayer_data: - jwplayer_data = {'playlist': [jwplayer_data]} - - entries = [] - - # JWPlayer backward compatibility: single playlist item - # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10 - if not isinstance(jwplayer_data['playlist'], list): - jwplayer_data['playlist'] = [jwplayer_data['playlist']] - - for video_data in jwplayer_data['playlist']: - # JWPlayer backward compatibility: flattened sources - # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 - if 'sources' not in video_data: - video_data['sources'] = [video_data] - - this_video_id = video_id or video_data['mediaid'] - - formats = [] - for source in video_data['sources']: - source_url = self._proto_relative_url(source['file']) - if base_url: - source_url = compat_urlparse.urljoin(base_url, source_url) - source_type = source.get('type') or '' - ext = mimetype2ext(source_type) or determine_ext(source_url) - if source_type == 'hls' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - source_url, this_video_id, mpd_id=mpd_id, fatal=False)) - # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 - elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'): - formats.append({ - 'url': source_url, - 'vcodec': 'none', - 'ext': ext, - }) - else: - height = int_or_none(source.get('height')) - if height is None: - # Often no height is provided but there is a label in - # format like 1080p. - height = int_or_none(self._search_regex( - r'^(\d{3,})[pP]$', source.get('label') or '', - 'height', default=None)) - a_format = { - 'url': source_url, - 'width': int_or_none(source.get('width')), - 'height': height, - 'ext': ext, - } - if source_url.startswith('rtmp'): - a_format['ext'] = 'flv' - - # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as - # of jwplayer.flash.swf - rtmp_url_parts = re.split( - r'((?:mp4|mp3|flv):)', source_url, 1) - if len(rtmp_url_parts) == 3: - rtmp_url, prefix, play_path = rtmp_url_parts - a_format.update({ - 'url': rtmp_url, - 'play_path': prefix + play_path, - }) - if rtmp_params: - a_format.update(rtmp_params) - formats.append(a_format) - self._sort_formats(formats) - - subtitles = {} - tracks = video_data.get('tracks') - if tracks and isinstance(tracks, list): - for track in tracks: - if track.get('kind') != 'captions': - continue - track_url = urljoin(base_url, track.get('file')) - if not track_url: - continue - subtitles.setdefault(track.get('label') or 'en', []).append({ - 'url': self._proto_relative_url(track_url) - }) - - entries.append({ - 'id': this_video_id, - 'title': video_data['title'] if require_title else video_data.get('title'), - 'description': video_data.get('description'), - 'thumbnail': self._proto_relative_url(video_data.get('image')), - 'timestamp': int_or_none(video_data.get('pubdate')), - 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), - 'subtitles': subtitles, - 'formats': formats, - }) - if len(entries) == 1: - return entries[0] - else: - return self.playlist_result(entries) - - -class JWPlatformIE(JWPlatformBaseIE): +class JWPlatformIE(InfoExtractor): _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P[a-zA-Z0-9]{8})' _TEST = { 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', diff --git a/youtube_dl/extractor/lemonde.py b/youtube_dl/extractor/lemonde.py index 42568f315..3306892e8 100644 --- a/youtube_dl/extractor/lemonde.py +++ b/youtube_dl/extractor/lemonde.py @@ -7,20 +7,40 @@ class LemondeIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?lemonde\.fr/(?:[^/]+/)*(?P[^/]+)\.html' _TESTS = [{ 'url': 'http://www.lemonde.fr/police-justice/video/2016/01/19/comprendre-l-affaire-bygmalion-en-cinq-minutes_4849702_1653578.html', - 'md5': '01fb3c92de4c12c573343d63e163d302', + 'md5': 'da120c8722d8632eec6ced937536cc98', 'info_dict': { 'id': 'lqm3kl', 'ext': 'mp4', 'title': "Comprendre l'affaire Bygmalion en 5 minutes", 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 320, + 'duration': 309, 'upload_date': '20160119', 'timestamp': 1453194778, 'uploader_id': '3pmkp', }, + }, { + # standard iframe embed + 'url': 'http://www.lemonde.fr/les-decodeurs/article/2016/10/18/tout-comprendre-du-ceta-le-petit-cousin-du-traite-transatlantique_5015920_4355770.html', + 'info_dict': { + 'id': 'uzsxms', + 'ext': 'mp4', + 'title': "CETA : quelles suites pour l'accord commercial entre l'Europe et le Canada ?", + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 325, + 'upload_date': '20161021', + 'timestamp': 1477044540, + 'uploader_id': '3pmkp', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://redaction.actu.lemonde.fr/societe/video/2016/01/18/calais-debut-des-travaux-de-defrichement-dans-la-jungle_4849233_3224.html', 'only_matching': True, + }, { + # YouTube embeds + 'url': 'http://www.lemonde.fr/pixels/article/2016/12/09/pourquoi-pewdiepie-superstar-de-youtube-a-menace-de-fermer-sa-chaine_5046649_4408996.html', + 'only_matching': True, }] def _real_extract(self, url): @@ -30,5 +50,9 @@ class LemondeIE(InfoExtractor): digiteka_url = self._proto_relative_url(self._search_regex( r'url\s*:\s*(["\'])(?P(?:https?://)?//(?:www\.)?(?:digiteka\.net|ultimedia\.com)/deliver/.+?)\1', - webpage, 'digiteka url', group='url')) - return self.url_result(digiteka_url, 'Digiteka') + webpage, 'digiteka url', group='url', default=None)) + + if digiteka_url: + return self.url_result(digiteka_url, 'Digiteka') + + return self.url_result(url, 'Generic') diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index e635f3c4d..a3712665b 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -8,6 +8,7 @@ from ..utils import ( determine_ext, float_or_none, int_or_none, + unsmuggle_url, ) @@ -15,20 +16,23 @@ class LimelightBaseIE(InfoExtractor): _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s' _API_URL = 'http://api.video.limelight.com/rest/organizations/%s/%s/%s/%s.json' - def _call_playlist_service(self, item_id, method, fatal=True): + def _call_playlist_service(self, item_id, method, fatal=True, referer=None): + headers = {} + if referer: + headers['Referer'] = referer return self._download_json( self._PLAYLIST_SERVICE_URL % (self._PLAYLIST_SERVICE_PATH, item_id, method), - item_id, 'Downloading PlaylistService %s JSON' % method, fatal=fatal) + item_id, 'Downloading PlaylistService %s JSON' % method, fatal=fatal, headers=headers) def _call_api(self, organization_id, item_id, method): return self._download_json( self._API_URL % (organization_id, self._API_PATH, item_id, method), item_id, 'Downloading API %s JSON' % method) - def _extract(self, item_id, pc_method, mobile_method, meta_method): - pc = self._call_playlist_service(item_id, pc_method) + def _extract(self, item_id, pc_method, mobile_method, meta_method, referer=None): + pc = self._call_playlist_service(item_id, pc_method, referer=referer) metadata = self._call_api(pc['orgId'], item_id, meta_method) - mobile = self._call_playlist_service(item_id, mobile_method, fatal=False) + mobile = self._call_playlist_service(item_id, mobile_method, fatal=False, referer=referer) return pc, mobile, metadata def _extract_info(self, streams, mobile_urls, properties): @@ -207,10 +211,13 @@ class LimelightMediaIE(LimelightBaseIE): _API_PATH = 'media' def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) video_id = self._match_id(url) pc, mobile, metadata = self._extract( - video_id, 'getPlaylistByMediaId', 'getMobilePlaylistByMediaId', 'properties') + video_id, 'getPlaylistByMediaId', + 'getMobilePlaylistByMediaId', 'properties', + smuggled_data.get('source_url')) return self._extract_info( pc['playlistItems'][0].get('streams', []), @@ -247,11 +254,13 @@ class LimelightChannelIE(LimelightBaseIE): _API_PATH = 'channels' def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) channel_id = self._match_id(url) pc, mobile, medias = self._extract( channel_id, 'getPlaylistByChannelId', - 'getMobilePlaylistWithNItemsByChannelId?begin=0&count=-1', 'media') + 'getMobilePlaylistWithNItemsByChannelId?begin=0&count=-1', + 'media', smuggled_data.get('source_url')) entries = [ self._extract_info( diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index 9880924e6..28f59f63c 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -6,12 +6,12 @@ from .common import InfoExtractor from ..compat import ( compat_parse_qs, compat_urllib_parse_unquote, + compat_urllib_parse_urlencode, ) from ..utils import ( determine_ext, ExtractorError, int_or_none, - urlencode_postdata, get_element_by_attribute, mimetype2ext, ) @@ -50,6 +50,21 @@ class MetacafeIE(InfoExtractor): }, 'skip': 'Page is temporarily unavailable.', }, + # metacafe video with family filter + { + 'url': 'http://www.metacafe.com/watch/2155630/adult_art_by_david_hart_156/', + 'md5': 'b06082c5079bbdcde677a6291fbdf376', + 'info_dict': { + 'id': '2155630', + 'ext': 'mp4', + 'title': 'Adult Art By David Hart 156', + 'uploader': '63346', + 'description': 'md5:9afac8fc885252201ad14563694040fc', + }, + 'params': { + 'skip_download': True, + }, + }, # AnyClip video { 'url': 'http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/', @@ -112,22 +127,6 @@ class MetacafeIE(InfoExtractor): def report_disclaimer(self): self.to_screen('Retrieving disclaimer') - def _confirm_age(self): - # Retrieve disclaimer - self.report_disclaimer() - self._download_webpage(self._DISCLAIMER, None, False, 'Unable to retrieve disclaimer') - - # Confirm age - self.report_age_confirmation() - self._download_webpage( - self._FILTER_POST, None, False, 'Unable to confirm age', - data=urlencode_postdata({ - 'filters': '0', - 'submit': "Continue - I'm over 18", - }), headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - }) - def _real_extract(self, url): # Extract id and simplified title from URL video_id, display_id = re.match(self._VALID_URL, url).groups() @@ -143,13 +142,15 @@ class MetacafeIE(InfoExtractor): if prefix == 'cb': return self.url_result('theplatform:%s' % ext_id, 'ThePlatform') - # self._confirm_age() + headers = { + # Disable family filter + 'Cookie': 'user=%s; ' % compat_urllib_parse_urlencode({'ffilter': False}) + } # AnyClip videos require the flashversion cookie so that we get the link # to the mp4 file - headers = {} if video_id.startswith('an-'): - headers['Cookie'] = 'flashVersion=0;' + headers['Cookie'] += 'flashVersion=0; ' # Retrieve video webpage to extract further information webpage = self._download_webpage(url, video_id, headers=headers) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index fc3c0cd3c..7fe79cb53 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import random import re from .common import InfoExtractor @@ -15,24 +14,7 @@ from ..utils import ( class NRKBaseIE(InfoExtractor): - _faked_ip = None - - def _download_webpage_handle(self, *args, **kwargs): - # NRK checks X-Forwarded-For HTTP header in order to figure out the - # origin of the client behind proxy. This allows to bypass geo - # restriction by faking this header's value to some Norway IP. - # We will do so once we encounter any geo restriction error. - if self._faked_ip: - # NB: str is intentional - kwargs.setdefault(str('headers'), {})['X-Forwarded-For'] = self._faked_ip - return super(NRKBaseIE, self)._download_webpage_handle(*args, **kwargs) - - def _fake_ip(self): - # Use fake IP from 37.191.128.0/17 in order to workaround geo - # restriction - def octet(lb=0, ub=255): - return random.randint(lb, ub) - self._faked_ip = '37.191.%d.%d' % (octet(128), octet()) + _GEO_COUNTRIES = ['NO'] def _real_extract(self, url): video_id = self._match_id(url) @@ -44,8 +26,6 @@ class NRKBaseIE(InfoExtractor): title = data.get('fullTitle') or data.get('mainTitle') or data['title'] video_id = data.get('id') or video_id - http_headers = {'X-Forwarded-For': self._faked_ip} if self._faked_ip else {} - entries = [] conviva = data.get('convivaStatistics') or {} @@ -90,7 +70,6 @@ class NRKBaseIE(InfoExtractor): 'duration': duration, 'subtitles': subtitles, 'formats': formats, - 'http_headers': http_headers, }) if not entries: @@ -107,19 +86,17 @@ class NRKBaseIE(InfoExtractor): }] if not entries: - message_type = data.get('messageType', '') - # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* - if 'IsGeoBlocked' in message_type and not self._faked_ip: - self.report_warning( - 'Video is geo restricted, trying to fake IP') - self._fake_ip() - return self._real_extract(url) - MESSAGES = { 'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', 'ProgramRightsHasExpired': 'Programmet har gått ut', 'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', } + message_type = data.get('messageType', '') + # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* + if 'IsGeoBlocked' in message_type: + self.raise_geo_restricted( + msg=MESSAGES.get('ProgramIsGeoBlocked'), + countries=self._GEO_COUNTRIES) raise ExtractorError( '%s said: %s' % (self.IE_NAME, MESSAGES.get( message_type, message_type)), @@ -188,12 +165,12 @@ class NRKIE(NRKBaseIE): https?:// (?: (?:www\.)?nrk\.no/video/PS\*| - v8-psapi\.nrk\.no/mediaelement/ + v8[-.]psapi\.nrk\.no/mediaelement/ ) ) - (?P[^/?#&]+) + (?P[^?#&]+) ''' - _API_HOST = 'v8.psapi.nrk.no' + _API_HOST = 'v8-psapi.nrk.no' _TESTS = [{ # video 'url': 'http://www.nrk.no/video/PS*150533', @@ -219,6 +196,9 @@ class NRKIE(NRKBaseIE): }, { 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', 'only_matching': True, + }, { + 'url': 'nrk:clip/7707d5a3-ebe7-434a-87d5-a3ebe7a34a70', + 'only_matching': True, }, { 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', 'only_matching': True, diff --git a/youtube_dl/extractor/ondemandkorea.py b/youtube_dl/extractor/ondemandkorea.py index de1d6b08a..df1ce3c1d 100644 --- a/youtube_dl/extractor/ondemandkorea.py +++ b/youtube_dl/extractor/ondemandkorea.py @@ -1,15 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import ( ExtractorError, js_to_json, ) -class OnDemandKoreaIE(JWPlatformBaseIE): +class OnDemandKoreaIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ondemandkorea\.com/(?P[^/]+)\.html' + _GEO_COUNTRIES = ['US', 'CA'] _TEST = { 'url': 'http://www.ondemandkorea.com/ask-us-anything-e43.html', 'info_dict': { @@ -35,7 +36,8 @@ class OnDemandKoreaIE(JWPlatformBaseIE): if 'msg_block_01.png' in webpage: self.raise_geo_restricted( - 'This content is not available in your region') + msg='This content is not available in your region', + countries=self._GEO_COUNTRIES) if 'This video is only available to ODK PLUS members.' in webpage: raise ExtractorError( diff --git a/youtube_dl/extractor/onet.py b/youtube_dl/extractor/onet.py index 0a501b3e5..94f57990b 100644 --- a/youtube_dl/extractor/onet.py +++ b/youtube_dl/extractor/onet.py @@ -23,7 +23,7 @@ class OnetBaseIE(InfoExtractor): return self._search_regex( r'id=(["\'])mvp:(?P.+?)\1', webpage, 'mvp id', group='id') - def _extract_from_id(self, video_id, webpage): + def _extract_from_id(self, video_id, webpage=None): response = self._download_json( 'http://qi.ckm.onetapi.pl/', video_id, query={ @@ -74,8 +74,10 @@ class OnetBaseIE(InfoExtractor): meta = video.get('meta', {}) - title = self._og_search_title(webpage, default=None) or meta['title'] - description = self._og_search_description(webpage, default=None) or meta.get('description') + title = (self._og_search_title( + webpage, default=None) if webpage else None) or meta['title'] + description = (self._og_search_description( + webpage, default=None) if webpage else None) or meta.get('description') duration = meta.get('length') or meta.get('lenght') timestamp = parse_iso8601(meta.get('addDate'), ' ') @@ -89,6 +91,18 @@ class OnetBaseIE(InfoExtractor): } +class OnetMVPIE(OnetBaseIE): + _VALID_URL = r'onetmvp:(?P\d+\.\d+)' + + _TEST = { + 'url': 'onetmvp:381027.1509591944', + 'only_matching': True, + } + + def _real_extract(self, url): + return self._extract_from_id(self._match_id(url)) + + class OnetIE(OnetBaseIE): _VALID_URL = r'https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/(?P[0-9a-z-]+)/(?P[0-9a-z]+)' IE_NAME = 'onet.tv' @@ -167,3 +181,44 @@ class OnetChannelIE(OnetBaseIE): channel_title = strip_or_none(get_element_by_class('o_channelName', webpage)) channel_description = strip_or_none(get_element_by_class('o_channelDesc', webpage)) return self.playlist_result(entries, channel_id, channel_title, channel_description) + + +class OnetPlIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?(?:onet|businessinsider\.com|plejada)\.pl/(?:[^/]+/)+(?P[0-9a-z]+)' + IE_NAME = 'onet.pl' + + _TESTS = [{ + 'url': 'http://eurosport.onet.pl/zimowe/skoki-narciarskie/ziobro-wygral-kwalifikacje-w-pjongczangu/9ckrly', + 'md5': 'b94021eb56214c3969380388b6e73cb0', + 'info_dict': { + 'id': '1561707.1685479', + 'ext': 'mp4', + 'title': 'Ziobro wygrał kwalifikacje w Pjongczangu', + 'description': 'md5:61fb0740084d2d702ea96512a03585b4', + 'upload_date': '20170214', + 'timestamp': 1487078046, + }, + }, { + 'url': 'http://film.onet.pl/zwiastuny/ghost-in-the-shell-drugi-zwiastun-pl/5q6yl3', + 'only_matching': True, + }, { + 'url': 'http://moto.onet.pl/jak-wybierane-sa-miejsca-na-fotoradary/6rs04e', + 'only_matching': True, + }, { + 'url': 'http://businessinsider.com.pl/wideo/scenariusz-na-koniec-swiata-wedlug-nasa/dwnqptk', + 'only_matching': True, + }, { + 'url': 'http://plejada.pl/weronika-rosati-o-swoim-domniemanym-slubie/n2bq89', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + mvp_id = self._search_regex( + r'data-params-mvp=["\'](\d+\.\d+)', webpage, 'mvp id') + + return self.url_result( + 'onetmvp:%s' % mvp_id, OnetMVPIE.ie_key(), video_id=mvp_id) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 32289d897..10896c442 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -75,17 +75,17 @@ class OpenloadIE(InfoExtractor): ']+id="[^"]+"[^>]*>([0-9]+)', webpage, 'openload ID') - first_three_chars = int(float(ol_id[0:][:3])) - fifth_char = int(float(ol_id[3:5])) - urlcode = '' - num = 5 + first_two_chars = int(float(ol_id[0:][:2])) + urlcode = [] + num = 2 while num < len(ol_id): - urlcode += compat_chr(int(float(ol_id[num:][:3])) + - first_three_chars - fifth_char * int(float(ol_id[num + 3:][:2]))) + key = int(float(ol_id[num + 3:][:2])) + urlcode.append((key, compat_chr(int(float(ol_id[num:][:3])) - first_two_chars))) num += 5 - video_url = 'https://openload.co/stream/' + urlcode + video_url = 'https://openload.co/stream/' + ''.join( + [value for _, value in sorted(urlcode, key=lambda x: x[0])]) title = self._og_search_title(webpage, default=None) or self._search_regex( r']+class=["\']title["\'][^>]*>([^<]+)', webpage, diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 6baed773f..3e51b4dd7 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -193,6 +193,8 @@ class PBSIE(InfoExtractor): ) ''' % '|'.join(list(zip(*_STATIONS))[0]) + _GEO_COUNTRIES = ['US'] + _TESTS = [ { 'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/', @@ -489,11 +491,13 @@ class PBSIE(InfoExtractor): headers=self.geo_verification_headers()) if redirect_info['status'] == 'error': + message = self._ERRORS.get( + redirect_info['http_code'], redirect_info['message']) + if redirect_info['http_code'] == 403: + self.raise_geo_restricted( + msg=message, countries=self._GEO_COUNTRIES) raise ExtractorError( - '%s said: %s' % ( - self.IE_NAME, - self._ERRORS.get(redirect_info['http_code'], redirect_info['message'])), - expected=True) + '%s said: %s' % (self.IE_NAME, message), expected=True) format_url = redirect_info.get('url') if not format_url: diff --git a/youtube_dl/extractor/pinkbike.py b/youtube_dl/extractor/pinkbike.py index 6a4580d54..9f3501f77 100644 --- a/youtube_dl/extractor/pinkbike.py +++ b/youtube_dl/extractor/pinkbike.py @@ -64,7 +64,8 @@ class PinkbikeIE(InfoExtractor): 'video:duration', webpage, 'duration')) uploader = self._search_regex( - r'un:\s*"([^"]+)"', webpage, 'uploader', fatal=False) + r']+\brel=["\']author[^>]+>([^<]+)', webpage, + 'uploader', fatal=False) upload_date = unified_strdate(self._search_regex( r'class="fullTime"[^>]+title="([^"]+)"', webpage, 'upload date', fatal=False)) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 818d99c1f..9b413590a 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -2,27 +2,27 @@ from __future__ import unicode_literals import itertools -import os +# import os import re from .common import InfoExtractor from ..compat import ( compat_HTTPError, - compat_urllib_parse_unquote, - compat_urllib_parse_unquote_plus, - compat_urllib_parse_urlparse, + # compat_urllib_parse_unquote, + # compat_urllib_parse_unquote_plus, + # compat_urllib_parse_urlparse, ) from ..utils import ( ExtractorError, int_or_none, js_to_json, orderedSet, - sanitized_Request, + # sanitized_Request, str_to_int, ) -from ..aes import ( - aes_decrypt_text -) +# from ..aes import ( +# aes_decrypt_text +# ) class PornHubIE(InfoExtractor): @@ -109,10 +109,14 @@ class PornHubIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - req = sanitized_Request( - 'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) + def dl_webpage(platform): + return self._download_webpage( + 'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id, + video_id, headers={ + 'Cookie': 'age_verified=1; platform=%s' % platform, + }) + + webpage = dl_webpage('pc') error_msg = self._html_search_regex( r'(?s)]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P.+?)
', @@ -123,10 +127,19 @@ class PornHubIE(InfoExtractor): 'PornHub said: %s' % error_msg, expected=True, video_id=video_id) + tv_webpage = dl_webpage('tv') + + video_url = self._search_regex( + r']+\bsrc=(["\'])(?P(?:https?:)?//.+?)\1', tv_webpage, + 'video url', group='url') + + title = self._search_regex( + r'

([^>]+)

', tv_webpage, 'title', default=None) + # video_title from flashvars contains whitespace instead of non-ASCII (see # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying # on that anymore. - title = self._html_search_meta( + title = title or self._html_search_meta( 'twitter:title', webpage, default=None) or self._search_regex( (r']+class=["\']title["\'][^>]*>(?P[^<]+)', r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1', @@ -156,48 +169,6 @@ class PornHubIE(InfoExtractor): comment_count = self._extract_count( r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment') - video_variables = {} - for video_variablename, quote, video_variable in re.findall( - r'(player_quality_[0-9]{3,4}p\w+)\s*=\s*(["\'])(.+?)\2;', webpage): - video_variables[video_variablename] = video_variable - - video_urls = [] - for encoded_video_url in re.findall( - r'player_quality_[0-9]{3,4}p\s*=(.+?);', webpage): - for varname, varval in video_variables.items(): - encoded_video_url = encoded_video_url.replace(varname, varval) - video_urls.append(re.sub(r'[\s+]', '', encoded_video_url)) - - if webpage.find('"encrypted":true') != -1: - password = compat_urllib_parse_unquote_plus( - self._search_regex(r'"video_title":"([^"]+)', webpage, 'password')) - video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls)) - - formats = [] - for video_url in video_urls: - path = compat_urllib_parse_urlparse(video_url).path - extension = os.path.splitext(path)[1][1:] - format = path.split('/')[5].split('_')[:2] - format = '-'.join(format) - - m = re.match(r'^(?P<height>[0-9]+)[pP]-(?P<tbr>[0-9]+)[kK]$', format) - if m is None: - height = None - tbr = None - else: - height = int(m.group('height')) - tbr = int(m.group('tbr')) - - formats.append({ - 'url': video_url, - 'ext': extension, - 'format': format, - 'format_id': format, - 'tbr': tbr, - 'height': height, - }) - self._sort_formats(formats) - page_params = self._parse_json(self._search_regex( r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P<data>{[^}]+})', webpage, 'page parameters', group='data', default='{}'), @@ -209,6 +180,7 @@ class PornHubIE(InfoExtractor): return { 'id': video_id, + 'url': video_url, 'uploader': video_uploader, 'title': title, 'thumbnail': thumbnail, @@ -217,7 +189,7 @@ class PornHubIE(InfoExtractor): 'like_count': like_count, 'dislike_count': dislike_count, 'comment_count': comment_count, - 'formats': formats, + # 'formats': formats, 'age_limit': 18, 'tags': tags, 'categories': categories, diff --git a/youtube_dl/extractor/pornoxo.py b/youtube_dl/extractor/pornoxo.py index 1a0cce7e0..2831368b6 100644 --- a/youtube_dl/extractor/pornoxo.py +++ b/youtube_dl/extractor/pornoxo.py @@ -2,13 +2,13 @@ from __future__ import unicode_literals import re -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import ( str_to_int, ) -class PornoXOIE(JWPlatformBaseIE): +class PornoXOIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?pornoxo\.com/videos/(?P<id>\d+)/(?P<display_id>[^/]+)\.html' _TEST = { 'url': 'http://www.pornoxo.com/videos/7564/striptease-from-sexy-secretary.html', diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 5091d8456..1245309a7 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -424,3 +424,6 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): return self._extract_clip(url, webpage) elif page_type == 'playlist': return self._extract_playlist(url, webpage) + else: + raise ExtractorError( + 'Unsupported page type %s' % page_type, expected=True) diff --git a/youtube_dl/extractor/rentv.py b/youtube_dl/extractor/rentv.py index 422c02cff..d338b3a93 100644 --- a/youtube_dl/extractor/rentv.py +++ b/youtube_dl/extractor/rentv.py @@ -2,11 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from .jwplatform import JWPlatformBaseIE from ..compat import compat_str -class RENTVIE(JWPlatformBaseIE): +class RENTVIE(InfoExtractor): _VALID_URL = r'(?:rentv:|https?://(?:www\.)?ren\.tv/(?:player|video/epizod)/)(?P<id>\d+)' _TESTS = [{ 'url': 'http://ren.tv/video/epizod/118577', diff --git a/youtube_dl/extractor/rudo.py b/youtube_dl/extractor/rudo.py index 3bfe934d8..51644011e 100644 --- a/youtube_dl/extractor/rudo.py +++ b/youtube_dl/extractor/rudo.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import ( js_to_json, get_element_by_class, @@ -11,7 +11,7 @@ from ..utils import ( ) -class RudoIE(JWPlatformBaseIE): +class RudoIE(InfoExtractor): _VALID_URL = r'https?://rudo\.video/vod/(?P<id>[0-9a-zA-Z]+)' _TEST = { diff --git a/youtube_dl/extractor/screencastomatic.py b/youtube_dl/extractor/screencastomatic.py index 94a2a37d2..b5e76c9af 100644 --- a/youtube_dl/extractor/screencastomatic.py +++ b/youtube_dl/extractor/screencastomatic.py @@ -1,11 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import js_to_json -class ScreencastOMaticIE(JWPlatformBaseIE): +class ScreencastOMaticIE(InfoExtractor): _VALID_URL = r'https?://screencast-o-matic\.com/watch/(?P<id>[0-9a-zA-Z]+)' _TEST = { 'url': 'http://screencast-o-matic.com/watch/c2lD3BeOPl', diff --git a/youtube_dl/extractor/sendtonews.py b/youtube_dl/extractor/sendtonews.py index 9880a5a78..9d9652949 100644 --- a/youtube_dl/extractor/sendtonews.py +++ b/youtube_dl/extractor/sendtonews.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import ( float_or_none, parse_iso8601, @@ -14,7 +14,7 @@ from ..utils import ( ) -class SendtoNewsIE(JWPlatformBaseIE): +class SendtoNewsIE(InfoExtractor): _VALID_URL = r'https?://embed\.sendtonews\.com/player2/embedplayer\.php\?.*\bSC=(?P<id>[0-9A-Za-z-]+)' _TEST = { diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py index 319a48a7a..bb73eb1d5 100644 --- a/youtube_dl/extractor/srgssr.py +++ b/youtube_dl/extractor/srgssr.py @@ -14,6 +14,8 @@ from ..utils import ( class SRGSSRIE(InfoExtractor): _VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|srgssr):(?P<bu>srf|rts|rsi|rtr|swi):(?:[^:]+:)?(?P<type>video|audio):(?P<id>[0-9a-f\-]{36}|\d+)' + _GEO_BYPASS = False + _GEO_COUNTRIES = ['CH'] _ERRORS = { 'AGERATING12': 'To protect children under the age of 12, this video is only available between 8 p.m. and 6 a.m.', @@ -40,8 +42,12 @@ class SRGSSRIE(InfoExtractor): media_id)[media_type.capitalize()] if media_data.get('block') and media_data['block'] in self._ERRORS: - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, self._ERRORS[media_data['block']]), expected=True) + message = self._ERRORS[media_data['block']] + if media_data['block'] == 'GEOBLOCK': + self.raise_geo_restricted( + msg=message, countries=self._GEO_COUNTRIES) + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, message), expected=True) return media_data diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 10cf80885..1b5afb73e 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -13,6 +13,8 @@ from ..utils import ( class SVTBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['SE'] + def _extract_video(self, video_info, video_id): formats = [] for vr in video_info['videoReferences']: @@ -38,7 +40,9 @@ class SVTBaseIE(InfoExtractor): 'url': vurl, }) if not formats and video_info.get('rights', {}).get('geoBlockedSweden'): - self.raise_geo_restricted('This video is only available in Sweden') + self.raise_geo_restricted( + 'This video is only available in Sweden', + countries=self._GEO_COUNTRIES) self._sort_formats(formats) subtitles = {} diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 5c5987c6a..9a424b1c6 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -179,10 +179,12 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): if m: return [m.group('url')] + # Are whitesapces ignored in URLs? + # https://github.com/rg3/youtube-dl/issues/12044 matches = re.findall( - r'<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage) + r'(?s)<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage) if matches: - return list(zip(*matches))[1] + return [re.sub(r'\s', '', list(zip(*matches))[1][0])] @staticmethod def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False): diff --git a/youtube_dl/extractor/thisav.py b/youtube_dl/extractor/thisav.py index 4473a3c77..33683b139 100644 --- a/youtube_dl/extractor/thisav.py +++ b/youtube_dl/extractor/thisav.py @@ -3,13 +3,14 @@ from __future__ import unicode_literals import re -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import remove_end -class ThisAVIE(JWPlatformBaseIE): +class ThisAVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?thisav\.com/video/(?P<id>[0-9]+)/.*' _TESTS = [{ + # jwplayer 'url': 'http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html', 'md5': '0480f1ef3932d901f0e0e719f188f19b', 'info_dict': { @@ -20,6 +21,7 @@ class ThisAVIE(JWPlatformBaseIE): 'uploader_id': 'dj7970' } }, { + # html5 media 'url': 'http://www.thisav.com/video/242352/nerdy-18yo-big-ass-tattoos-and-glasses.html', 'md5': 'ba90c076bd0f80203679e5b60bf523ee', 'info_dict': { @@ -48,8 +50,12 @@ class ThisAVIE(JWPlatformBaseIE): }], } else: - info_dict = self._extract_jwplayer_data( - webpage, video_id, require_title=False) + entries = self._parse_html5_media_entries(url, webpage, video_id) + if entries: + info_dict = entries[0] + else: + info_dict = self._extract_jwplayer_data( + webpage, video_id, require_title=False) uploader = self._html_search_regex( r': <a href="http://www.thisav.com/user/[0-9]+/(?:[^"]+)">([^<]+)</a>', webpage, 'uploader name', fatal=False) diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index ad79db92b..7aeb2c620 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -24,6 +24,7 @@ class TV4IE(InfoExtractor): sport/| ) )(?P<id>[0-9]+)''' + _GEO_COUNTRIES = ['SE'] _TESTS = [ { 'url': 'http://www.tv4.se/kalla-fakta/klipp/kalla-fakta-5-english-subtitles-2491650', @@ -71,16 +72,12 @@ class TV4IE(InfoExtractor): 'http://www.tv4play.se/player/assets/%s.json' % video_id, video_id, 'Downloading video info JSON') - # If is_geo_restricted is true, it doesn't necessarily mean we can't download it - if info.get('is_geo_restricted'): - self.report_warning('This content might not be available in your country due to licensing restrictions.') - title = info['title'] subtitles = {} formats = [] # http formats are linked with unresolvable host - for kind in ('hls', ''): + for kind in ('hls3', ''): data = self._download_json( 'https://prima.tv4play.se/api/web/asset/%s/play.json' % video_id, video_id, 'Downloading sources JSON', query={ @@ -113,6 +110,10 @@ class TV4IE(InfoExtractor): 'url': manifest_url, 'ext': 'vtt', }]}) + + if not formats and info.get('is_geo_restricted'): + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/tvn24.py b/youtube_dl/extractor/tvn24.py new file mode 100644 index 000000000..12ed6039c --- /dev/null +++ b/youtube_dl/extractor/tvn24.py @@ -0,0 +1,76 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unescapeHTML, +) + + +class TVN24IE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:[^/]+)\.)?tvn24(?:bis)?\.pl/(?:[^/]+/)*(?P<id>[^/]+)\.html' + _TESTS = [{ + 'url': 'http://www.tvn24.pl/wiadomosci-z-kraju,3/oredzie-artura-andrusa,702428.html', + 'md5': 'fbdec753d7bc29d96036808275f2130c', + 'info_dict': { + 'id': '1584444', + 'ext': 'mp4', + 'title': '"Święta mają być wesołe, dlatego, ludziska, wszyscy pod jemiołę"', + 'description': 'Wyjątkowe orędzie Artura Andrusa, jednego z gości "Szkła kontaktowego".', + 'thumbnail': 're:http://.*[.]jpeg', + } + }, { + 'url': 'http://fakty.tvn24.pl/ogladaj-online,60/53-konferencja-bezpieczenstwa-w-monachium,716431.html', + 'only_matching': True, + }, { + 'url': 'http://sport.tvn24.pl/pilka-nozna,105/ligue-1-kamil-glik-rozcial-glowe-monaco-tylko-remisuje-z-bastia,716522.html', + 'only_matching': True, + }, { + 'url': 'http://tvn24bis.pl/poranek,146,m/gen-koziej-w-tvn24-bis-wracamy-do-czasow-zimnej-wojny,715660.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage) + + def extract_json(attr, name, fatal=True): + return self._parse_json( + self._search_regex( + r'\b%s=(["\'])(?P<json>(?!\1).+?)\1' % attr, webpage, + name, group='json', fatal=fatal) or '{}', + video_id, transform_source=unescapeHTML, fatal=fatal) + + quality_data = extract_json('data-quality', 'formats') + + formats = [] + for format_id, url in quality_data.items(): + formats.append({ + 'url': url, + 'format_id': format_id, + 'height': int_or_none(format_id.rstrip('p')), + }) + self._sort_formats(formats) + + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._html_search_regex( + r'\bdata-poster=(["\'])(?P<url>(?!\1).+?)\1', webpage, + 'thumbnail', group='url') + + share_params = extract_json( + 'data-share-params', 'share params', fatal=False) + if isinstance(share_params, dict): + video_id = share_params.get('id') or video_id + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/youtube_dl/extractor/tvnoe.py b/youtube_dl/extractor/tvnoe.py index 6d5c74826..1a5b76bf2 100644 --- a/youtube_dl/extractor/tvnoe.py +++ b/youtube_dl/extractor/tvnoe.py @@ -1,7 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import ( clean_html, get_element_by_class, @@ -9,7 +9,7 @@ from ..utils import ( ) -class TVNoeIE(JWPlatformBaseIE): +class TVNoeIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tvnoe\.cz/video/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.tvnoe.cz/video/10362', diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py index bef639462..8152acefd 100644 --- a/youtube_dl/extractor/vbox7.py +++ b/youtube_dl/extractor/vbox7.py @@ -20,6 +20,7 @@ class Vbox7IE(InfoExtractor): ) (?P<id>[\da-fA-F]+) ''' + _GEO_COUNTRIES = ['BG'] _TESTS = [{ 'url': 'http://vbox7.com/play:0946fff23c', 'md5': 'a60f9ab3a3a2f013ef9a967d5f7be5bf', @@ -78,7 +79,7 @@ class Vbox7IE(InfoExtractor): video_url = video['src'] if '/na.mp4' in video_url: - self.raise_geo_restricted() + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) uploader = video.get('uploader') diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 8a574bc26..0f8c156a7 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -14,6 +14,7 @@ from ..utils import ( class VGTVIE(XstreamIE): IE_DESC = 'VGTV, BTTV, FTV, Aftenposten and Aftonbladet' + _GEO_BYPASS = False _HOST_TO_APPNAME = { 'vgtv.no': 'vgtv', @@ -217,7 +218,8 @@ class VGTVIE(XstreamIE): properties = try_get( data, lambda x: x['streamConfiguration']['properties'], list) if properties and 'geoblocked' in properties: - raise self.raise_geo_restricted() + raise self.raise_geo_restricted( + countries=[host.rpartition('.')[-1].partition('/')[0].upper()]) self._sort_formats(info['formats']) diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index 8a00c8fee..f0a7fd739 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -70,10 +70,10 @@ class ViceBaseIE(AdobePassIE): 'url': uplynk_preplay_url, 'id': video_id, 'title': title, - 'description': base.get('body'), + 'description': base.get('body') or base.get('display_body'), 'thumbnail': watch_hub_data.get('cover-image') or watch_hub_data.get('thumbnail'), - 'duration': parse_duration(video_data.get('video_duration') or watch_hub_data.get('video-duration')), - 'timestamp': int_or_none(video_data.get('created_at')), + 'duration': int_or_none(video_data.get('video_duration')) or parse_duration(watch_hub_data.get('video-duration')), + 'timestamp': int_or_none(video_data.get('created_at'), 1000), 'age_limit': parse_age_limit(video_data.get('video_rating')), 'series': video_data.get('show_title') or watch_hub_data.get('show-title'), 'episode_number': int_or_none(episode.get('episode_number') or watch_hub_data.get('episode')), diff --git a/youtube_dl/extractor/viceland.py b/youtube_dl/extractor/viceland.py index 0eff055a6..87f9216b5 100644 --- a/youtube_dl/extractor/viceland.py +++ b/youtube_dl/extractor/viceland.py @@ -7,16 +7,16 @@ from .vice import ViceBaseIE class VicelandIE(ViceBaseIE): _VALID_URL = r'https?://(?:www\.)?viceland\.com/[^/]+/video/[^/]+/(?P<id>[a-f0-9]+)' _TEST = { - 'url': 'https://www.viceland.com/en_us/video/cyberwar-trailer/57608447973ee7705f6fbd4e', + 'url': 'https://www.viceland.com/en_us/video/trapped/588a70d0dba8a16007de7316', 'info_dict': { - 'id': '57608447973ee7705f6fbd4e', + 'id': '588a70d0dba8a16007de7316', 'ext': 'mp4', - 'title': 'CYBERWAR (Trailer)', - 'description': 'Tapping into the geopolitics of hacking and surveillance, Ben Makuch travels the world to meet with hackers, government officials, and dissidents to investigate the ecosystem of cyberwarfare.', + 'title': 'TRAPPED (Series Trailer)', + 'description': 'md5:7a8e95c2b6cd86461502a2845e581ccf', 'age_limit': 14, - 'timestamp': 1466008539, - 'upload_date': '20160615', - 'uploader_id': '11', + 'timestamp': 1485474122, + 'upload_date': '20170126', + 'uploader_id': '57a204098cb727dec794c6a3', 'uploader': 'Viceland', }, 'params': { diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index 9950c62ad..1f1828fce 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re -from .jwplatform import JWPlatformBaseIE +from .common import InfoExtractor from ..utils import ( decode_packed_codes, js_to_json, @@ -12,7 +12,7 @@ from ..utils import ( ) -class VidziIE(JWPlatformBaseIE): +class VidziIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' _TESTS = [{ 'url': 'http://vidzi.tv/cghql9yq6emu.html', diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 9c48701c1..e9c8bf824 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -27,6 +27,7 @@ class VikiBaseIE(InfoExtractor): _APP_VERSION = '2.2.5.1428709186' _APP_SECRET = '-$iJ}@p7!G@SyU/je1bEyWg}upLu-6V6-Lg9VD(]siH,r.,m-r|ulZ,U4LC/SeR)' + _GEO_BYPASS = False _NETRC_MACHINE = 'viki' _token = None @@ -77,8 +78,11 @@ class VikiBaseIE(InfoExtractor): def _check_errors(self, data): for reason, status in data.get('blocking', {}).items(): if status and reason in self._ERRORS: + message = self._ERRORS[reason] + if reason == 'geo': + self.raise_geo_restricted(msg=message) raise ExtractorError('%s said: %s' % ( - self.IE_NAME, self._ERRORS[reason]), expected=True) + self.IE_NAME, message), expected=True) def _real_initialize(self): self._login() diff --git a/youtube_dl/extractor/vodpl.py b/youtube_dl/extractor/vodpl.py new file mode 100644 index 000000000..9e919708e --- /dev/null +++ b/youtube_dl/extractor/vodpl.py @@ -0,0 +1,32 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .onet import OnetBaseIE + + +class VODPlIE(OnetBaseIE): + _VALID_URL = r'https?://vod\.pl/(?:[^/]+/)+(?P<id>[0-9a-zA-Z]+)' + + _TESTS = [{ + 'url': 'https://vod.pl/filmy/chlopaki-nie-placza/3ep3jns', + 'md5': 'a7dc3b2f7faa2421aefb0ecaabf7ec74', + 'info_dict': { + 'id': '3ep3jns', + 'ext': 'mp4', + 'title': 'Chłopaki nie płaczą', + 'description': 'md5:f5f03b84712e55f5ac9f0a3f94445224', + 'timestamp': 1463415154, + 'duration': 5765, + 'upload_date': '20160516', + }, + }, { + 'url': 'https://vod.pl/seriale/belfer-na-planie-praca-kamery-online/2c10heh', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + info_dict = self._extract_from_id(self._search_mvp_id(webpage), webpage) + info_dict['id'] = video_id + return info_dict diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index 54eb51427..c022fb33e 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -1,10 +1,10 @@ from __future__ import unicode_literals +from .common import InfoExtractor from .youtube import YoutubeIE -from .jwplatform import JWPlatformBaseIE -class WimpIE(JWPlatformBaseIE): +class WimpIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?wimp\.com/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.wimp.com/maru-is-exhausted/', diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 11717fe98..5584674a0 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -44,6 +44,9 @@ class XTubeIE(InfoExtractor): }, { 'url': 'xtube:625837', 'only_matching': True, + }, { + 'url': 'xtube:kVTUy_G222_', + 'only_matching': True, }] def _real_extract(self, url): @@ -53,11 +56,16 @@ class XTubeIE(InfoExtractor): if not display_id: display_id = video_id - url = 'http://www.xtube.com/video-watch/-%s' % video_id - req = sanitized_Request(url) - req.add_header('Cookie', 'age_verified=1; cookiesAccepted=1') - webpage = self._download_webpage(req, display_id) + if video_id.isdigit() and len(video_id) < 11: + url_pattern = 'http://www.xtube.com/video-watch/-%s' + else: + url_pattern = 'http://www.xtube.com/watch.php?v=%s' + + webpage = self._download_webpage( + url_pattern % video_id, display_id, headers={ + 'Cookie': 'age_verified=1; cookiesAccepted=1', + }) sources = self._parse_json(self._search_regex( r'(["\'])sources\1\s*:\s*(?P<sources>{.+?}),', @@ -73,7 +81,7 @@ class XTubeIE(InfoExtractor): self._sort_formats(formats) title = self._search_regex( - (r'<h1>(?P<title>[^<]+)</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'), + (r'<h1>\s*(?P<title>[^<]+?)\s*</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'), webpage, 'title', group='title') description = self._search_regex( r'</h1>\s*<p>([^<]+)', webpage, 'description', fatal=False) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 76710931a..dec02804b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -34,6 +34,7 @@ from ..utils import ( int_or_none, mimetype2ext, orderedSet, + parse_codecs, parse_duration, remove_quotes, remove_start, @@ -1696,15 +1697,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): codecs = mobj.group('val') break if codecs: - codecs = codecs.split(',') - if len(codecs) == 2: - acodec, vcodec = codecs[1], codecs[0] - else: - acodec, vcodec = (codecs[0], 'none') if kind == 'audio' else ('none', codecs[0]) - dct.update({ - 'acodec': acodec, - 'vcodec': vcodec, - }) + dct.update(parse_codecs(codecs)) formats.append(dct) elif video_info.get('hlsvp'): manifest_url = video_info['hlsvp'][0] diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index a365923fb..523bb5c95 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -20,9 +20,9 @@ from ..utils import ( class ZDFBaseIE(InfoExtractor): - def _call_api(self, url, player, referrer, video_id): + def _call_api(self, url, player, referrer, video_id, item): return self._download_json( - url, video_id, 'Downloading JSON content', + url, video_id, 'Downloading JSON %s' % item, headers={ 'Referer': referrer, 'Api-Auth': 'Bearer %s' % player['apiToken'], @@ -104,7 +104,7 @@ class ZDFIE(ZDFBaseIE): }) formats.append(f) - def _extract_entry(self, url, content, video_id): + def _extract_entry(self, url, player, content, video_id): title = content.get('title') or content['teaserHeadline'] t = content['mainVideoContent']['http://zdf.de/rels/target'] @@ -116,7 +116,8 @@ class ZDFIE(ZDFBaseIE): 'http://zdf.de/rels/streams/ptmd-template'].replace( '{playerId}', 'portal') - ptmd = self._download_json(urljoin(url, ptmd_path), video_id) + ptmd = self._call_api( + urljoin(url, ptmd_path), player, url, video_id, 'metadata') formats = [] track_uris = set() @@ -174,8 +175,9 @@ class ZDFIE(ZDFBaseIE): } def _extract_regular(self, url, player, video_id): - content = self._call_api(player['content'], player, url, video_id) - return self._extract_entry(player['content'], content, video_id) + content = self._call_api( + player['content'], player, url, video_id, 'content') + return self._extract_entry(player['content'], player, content, video_id) def _extract_mobile(self, video_id): document = self._download_json( diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 349f44778..2c880d06a 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -228,17 +228,29 @@ def parseOpts(overrideArguments=None): action='store_const', const='::', dest='source_address', help='Make all connections via IPv6', ) - network.add_option( + + geo = optparse.OptionGroup(parser, 'Geo Restriction') + geo.add_option( '--geo-verification-proxy', dest='geo_verification_proxy', default=None, metavar='URL', help='Use this proxy to verify the IP address for some geo-restricted sites. ' - 'The default proxy specified by --proxy (or none, if the options is not present) is used for the actual downloading.' - ) - network.add_option( + 'The default proxy specified by --proxy (or none, if the options is not present) is used for the actual downloading.') + geo.add_option( '--cn-verification-proxy', dest='cn_verification_proxy', default=None, metavar='URL', - help=optparse.SUPPRESS_HELP, - ) + help=optparse.SUPPRESS_HELP) + geo.add_option( + '--geo-bypass', + action='store_true', dest='geo_bypass', default=True, + help='Bypass geographic restriction via faking X-Forwarded-For HTTP header (experimental)') + geo.add_option( + '--no-geo-bypass', + action='store_false', dest='geo_bypass', default=True, + help='Do not bypass geographic restriction via faking X-Forwarded-For HTTP header (experimental)') + geo.add_option( + '--geo-bypass-country', metavar='CODE', + dest='geo_bypass_country', default=None, + help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code (experimental)') selection = optparse.OptionGroup(parser, 'Video Selection') selection.add_option( @@ -298,14 +310,16 @@ def parseOpts(overrideArguments=None): metavar='FILTER', dest='match_filter', default=None, help=( 'Generic video filter. ' - 'Specify any key (see help for -o for a list of available keys) to' - ' match if the key is present, ' - '!key to check if the key is not present,' + 'Specify any key (see help for -o for a list of available keys) to ' + 'match if the key is present, ' + '!key to check if the key is not present, ' 'key > NUMBER (like "comment_count > 12", also works with ' - '>=, <, <=, !=, =) to compare against a number, and ' - '& to require multiple matches. ' - 'Values which are not known are excluded unless you' - ' put a question mark (?) after the operator.' + '>=, <, <=, !=, =) to compare against a number, ' + 'key = \'LITERAL\' (like "uploader = \'Mike Smith\'", also works with !=) ' + 'to match against a string literal ' + 'and & to require multiple matches. ' + 'Values which are not known are excluded unless you ' + 'put a question mark (?) after the operator. ' 'For example, to only match videos that have been liked more than ' '100 times and disliked less than 50 times (or the dislike ' 'functionality is not available at the given service), but who ' @@ -834,6 +848,7 @@ def parseOpts(overrideArguments=None): parser.add_option_group(general) parser.add_option_group(network) + parser.add_option_group(geo) parser.add_option_group(selection) parser.add_option_group(downloader) parser.add_option_group(filesystem) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 1279a9042..17b83794a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -23,6 +23,7 @@ import operator import os import pipes import platform +import random import re import socket import ssl @@ -701,7 +702,12 @@ def bug_reports_message(): return msg -class ExtractorError(Exception): +class YoutubeDLError(Exception): + """Base exception for YoutubeDL errors.""" + pass + + +class ExtractorError(YoutubeDLError): """Error during info extraction.""" def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None): @@ -742,7 +748,19 @@ class RegexNotFoundError(ExtractorError): pass -class DownloadError(Exception): +class GeoRestrictedError(ExtractorError): + """Geographic restriction Error exception. + + This exception may be thrown when a video is not available from your + geographic location due to geographic restrictions imposed by a website. + """ + def __init__(self, msg, countries=None): + super(GeoRestrictedError, self).__init__(msg, expected=True) + self.msg = msg + self.countries = countries + + +class DownloadError(YoutubeDLError): """Download Error exception. This exception may be thrown by FileDownloader objects if they are not @@ -756,7 +774,7 @@ class DownloadError(Exception): self.exc_info = exc_info -class SameFileError(Exception): +class SameFileError(YoutubeDLError): """Same File exception. This exception will be thrown by FileDownloader objects if they detect @@ -765,7 +783,7 @@ class SameFileError(Exception): pass -class PostProcessingError(Exception): +class PostProcessingError(YoutubeDLError): """Post Processing exception. This exception may be raised by PostProcessor's .run() method to @@ -773,15 +791,16 @@ class PostProcessingError(Exception): """ def __init__(self, msg): + super(PostProcessingError, self).__init__(msg) self.msg = msg -class MaxDownloadsReached(Exception): +class MaxDownloadsReached(YoutubeDLError): """ --max-downloads limit has been reached. """ pass -class UnavailableVideoError(Exception): +class UnavailableVideoError(YoutubeDLError): """Unavailable Format exception. This exception will be thrown when a video is requested @@ -790,7 +809,7 @@ class UnavailableVideoError(Exception): pass -class ContentTooShortError(Exception): +class ContentTooShortError(YoutubeDLError): """Content Too Short exception. This exception may be raised by FileDownloader objects when a file they @@ -799,12 +818,15 @@ class ContentTooShortError(Exception): """ def __init__(self, downloaded, expected): + super(ContentTooShortError, self).__init__( + 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected) + ) # Both in bytes self.downloaded = downloaded self.expected = expected -class XAttrMetadataError(Exception): +class XAttrMetadataError(YoutubeDLError): def __init__(self, code=None, msg='Unknown error'): super(XAttrMetadataError, self).__init__(msg) self.code = code @@ -820,7 +842,7 @@ class XAttrMetadataError(Exception): self.reason = 'NOT_SUPPORTED' -class XAttrUnavailableError(Exception): +class XAttrUnavailableError(YoutubeDLError): pass @@ -2383,6 +2405,7 @@ def _match_one(filter_part, dct): \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s* (?: (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)| + (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)| (?P<strval>(?![0-9.])[a-z0-9A-Z]*) ) \s*$ @@ -2391,7 +2414,8 @@ def _match_one(filter_part, dct): if m: op = COMPARISON_OPERATORS[m.group('op')] actual_value = dct.get(m.group('key')) - if (m.group('strval') is not None or + if (m.group('quotedstrval') is not None or + m.group('strval') is not None or # If the original field is a string and matching comparisonvalue is # a number we should respect the origin of the original field # and process comparison value as a string (see @@ -2401,7 +2425,10 @@ def _match_one(filter_part, dct): if m.group('op') not in ('=', '!='): raise ValueError( 'Operator %s does not support string values!' % m.group('op')) - comparison_value = m.group('strval') or m.group('intval') + comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval') + quote = m.group('quote') + if quote is not None: + comparison_value = comparison_value.replace(r'\%s' % quote, quote) else: try: comparison_value = int(m.group('intval')) @@ -3013,6 +3040,260 @@ class ISO3166Utils(object): return cls._country_map.get(code.upper()) +class GeoUtils(object): + # Major IPv4 address blocks per country + _country_ip_map = { + 'AD': '85.94.160.0/19', + 'AE': '94.200.0.0/13', + 'AF': '149.54.0.0/17', + 'AG': '209.59.64.0/18', + 'AI': '204.14.248.0/21', + 'AL': '46.99.0.0/16', + 'AM': '46.70.0.0/15', + 'AO': '105.168.0.0/13', + 'AP': '159.117.192.0/21', + 'AR': '181.0.0.0/12', + 'AS': '202.70.112.0/20', + 'AT': '84.112.0.0/13', + 'AU': '1.128.0.0/11', + 'AW': '181.41.0.0/18', + 'AZ': '5.191.0.0/16', + 'BA': '31.176.128.0/17', + 'BB': '65.48.128.0/17', + 'BD': '114.130.0.0/16', + 'BE': '57.0.0.0/8', + 'BF': '129.45.128.0/17', + 'BG': '95.42.0.0/15', + 'BH': '37.131.0.0/17', + 'BI': '154.117.192.0/18', + 'BJ': '137.255.0.0/16', + 'BL': '192.131.134.0/24', + 'BM': '196.12.64.0/18', + 'BN': '156.31.0.0/16', + 'BO': '161.56.0.0/16', + 'BQ': '161.0.80.0/20', + 'BR': '152.240.0.0/12', + 'BS': '24.51.64.0/18', + 'BT': '119.2.96.0/19', + 'BW': '168.167.0.0/16', + 'BY': '178.120.0.0/13', + 'BZ': '179.42.192.0/18', + 'CA': '99.224.0.0/11', + 'CD': '41.243.0.0/16', + 'CF': '196.32.200.0/21', + 'CG': '197.214.128.0/17', + 'CH': '85.0.0.0/13', + 'CI': '154.232.0.0/14', + 'CK': '202.65.32.0/19', + 'CL': '152.172.0.0/14', + 'CM': '165.210.0.0/15', + 'CN': '36.128.0.0/10', + 'CO': '181.240.0.0/12', + 'CR': '201.192.0.0/12', + 'CU': '152.206.0.0/15', + 'CV': '165.90.96.0/19', + 'CW': '190.88.128.0/17', + 'CY': '46.198.0.0/15', + 'CZ': '88.100.0.0/14', + 'DE': '53.0.0.0/8', + 'DJ': '197.241.0.0/17', + 'DK': '87.48.0.0/12', + 'DM': '192.243.48.0/20', + 'DO': '152.166.0.0/15', + 'DZ': '41.96.0.0/12', + 'EC': '186.68.0.0/15', + 'EE': '90.190.0.0/15', + 'EG': '156.160.0.0/11', + 'ER': '196.200.96.0/20', + 'ES': '88.0.0.0/11', + 'ET': '196.188.0.0/14', + 'EU': '2.16.0.0/13', + 'FI': '91.152.0.0/13', + 'FJ': '144.120.0.0/16', + 'FM': '119.252.112.0/20', + 'FO': '88.85.32.0/19', + 'FR': '90.0.0.0/9', + 'GA': '41.158.0.0/15', + 'GB': '25.0.0.0/8', + 'GD': '74.122.88.0/21', + 'GE': '31.146.0.0/16', + 'GF': '161.22.64.0/18', + 'GG': '62.68.160.0/19', + 'GH': '45.208.0.0/14', + 'GI': '85.115.128.0/19', + 'GL': '88.83.0.0/19', + 'GM': '160.182.0.0/15', + 'GN': '197.149.192.0/18', + 'GP': '104.250.0.0/19', + 'GQ': '105.235.224.0/20', + 'GR': '94.64.0.0/13', + 'GT': '168.234.0.0/16', + 'GU': '168.123.0.0/16', + 'GW': '197.214.80.0/20', + 'GY': '181.41.64.0/18', + 'HK': '113.252.0.0/14', + 'HN': '181.210.0.0/16', + 'HR': '93.136.0.0/13', + 'HT': '148.102.128.0/17', + 'HU': '84.0.0.0/14', + 'ID': '39.192.0.0/10', + 'IE': '87.32.0.0/12', + 'IL': '79.176.0.0/13', + 'IM': '5.62.80.0/20', + 'IN': '117.192.0.0/10', + 'IO': '203.83.48.0/21', + 'IQ': '37.236.0.0/14', + 'IR': '2.176.0.0/12', + 'IS': '82.221.0.0/16', + 'IT': '79.0.0.0/10', + 'JE': '87.244.64.0/18', + 'JM': '72.27.0.0/17', + 'JO': '176.29.0.0/16', + 'JP': '126.0.0.0/8', + 'KE': '105.48.0.0/12', + 'KG': '158.181.128.0/17', + 'KH': '36.37.128.0/17', + 'KI': '103.25.140.0/22', + 'KM': '197.255.224.0/20', + 'KN': '198.32.32.0/19', + 'KP': '175.45.176.0/22', + 'KR': '175.192.0.0/10', + 'KW': '37.36.0.0/14', + 'KY': '64.96.0.0/15', + 'KZ': '2.72.0.0/13', + 'LA': '115.84.64.0/18', + 'LB': '178.135.0.0/16', + 'LC': '192.147.231.0/24', + 'LI': '82.117.0.0/19', + 'LK': '112.134.0.0/15', + 'LR': '41.86.0.0/19', + 'LS': '129.232.0.0/17', + 'LT': '78.56.0.0/13', + 'LU': '188.42.0.0/16', + 'LV': '46.109.0.0/16', + 'LY': '41.252.0.0/14', + 'MA': '105.128.0.0/11', + 'MC': '88.209.64.0/18', + 'MD': '37.246.0.0/16', + 'ME': '178.175.0.0/17', + 'MF': '74.112.232.0/21', + 'MG': '154.126.0.0/17', + 'MH': '117.103.88.0/21', + 'MK': '77.28.0.0/15', + 'ML': '154.118.128.0/18', + 'MM': '37.111.0.0/17', + 'MN': '49.0.128.0/17', + 'MO': '60.246.0.0/16', + 'MP': '202.88.64.0/20', + 'MQ': '109.203.224.0/19', + 'MR': '41.188.64.0/18', + 'MS': '208.90.112.0/22', + 'MT': '46.11.0.0/16', + 'MU': '105.16.0.0/12', + 'MV': '27.114.128.0/18', + 'MW': '105.234.0.0/16', + 'MX': '187.192.0.0/11', + 'MY': '175.136.0.0/13', + 'MZ': '197.218.0.0/15', + 'NA': '41.182.0.0/16', + 'NC': '101.101.0.0/18', + 'NE': '197.214.0.0/18', + 'NF': '203.17.240.0/22', + 'NG': '105.112.0.0/12', + 'NI': '186.76.0.0/15', + 'NL': '145.96.0.0/11', + 'NO': '84.208.0.0/13', + 'NP': '36.252.0.0/15', + 'NR': '203.98.224.0/19', + 'NU': '49.156.48.0/22', + 'NZ': '49.224.0.0/14', + 'OM': '5.36.0.0/15', + 'PA': '186.72.0.0/15', + 'PE': '186.160.0.0/14', + 'PF': '123.50.64.0/18', + 'PG': '124.240.192.0/19', + 'PH': '49.144.0.0/13', + 'PK': '39.32.0.0/11', + 'PL': '83.0.0.0/11', + 'PM': '70.36.0.0/20', + 'PR': '66.50.0.0/16', + 'PS': '188.161.0.0/16', + 'PT': '85.240.0.0/13', + 'PW': '202.124.224.0/20', + 'PY': '181.120.0.0/14', + 'QA': '37.210.0.0/15', + 'RE': '139.26.0.0/16', + 'RO': '79.112.0.0/13', + 'RS': '178.220.0.0/14', + 'RU': '5.136.0.0/13', + 'RW': '105.178.0.0/15', + 'SA': '188.48.0.0/13', + 'SB': '202.1.160.0/19', + 'SC': '154.192.0.0/11', + 'SD': '154.96.0.0/13', + 'SE': '78.64.0.0/12', + 'SG': '152.56.0.0/14', + 'SI': '188.196.0.0/14', + 'SK': '78.98.0.0/15', + 'SL': '197.215.0.0/17', + 'SM': '89.186.32.0/19', + 'SN': '41.82.0.0/15', + 'SO': '197.220.64.0/19', + 'SR': '186.179.128.0/17', + 'SS': '105.235.208.0/21', + 'ST': '197.159.160.0/19', + 'SV': '168.243.0.0/16', + 'SX': '190.102.0.0/20', + 'SY': '5.0.0.0/16', + 'SZ': '41.84.224.0/19', + 'TC': '65.255.48.0/20', + 'TD': '154.68.128.0/19', + 'TG': '196.168.0.0/14', + 'TH': '171.96.0.0/13', + 'TJ': '85.9.128.0/18', + 'TK': '27.96.24.0/21', + 'TL': '180.189.160.0/20', + 'TM': '95.85.96.0/19', + 'TN': '197.0.0.0/11', + 'TO': '175.176.144.0/21', + 'TR': '78.160.0.0/11', + 'TT': '186.44.0.0/15', + 'TV': '202.2.96.0/19', + 'TW': '120.96.0.0/11', + 'TZ': '156.156.0.0/14', + 'UA': '93.72.0.0/13', + 'UG': '154.224.0.0/13', + 'US': '3.0.0.0/8', + 'UY': '167.56.0.0/13', + 'UZ': '82.215.64.0/18', + 'VA': '212.77.0.0/19', + 'VC': '24.92.144.0/20', + 'VE': '186.88.0.0/13', + 'VG': '172.103.64.0/18', + 'VI': '146.226.0.0/16', + 'VN': '14.160.0.0/11', + 'VU': '202.80.32.0/20', + 'WF': '117.20.32.0/21', + 'WS': '202.4.32.0/19', + 'YE': '134.35.0.0/16', + 'YT': '41.242.116.0/22', + 'ZA': '41.0.0.0/11', + 'ZM': '165.56.0.0/13', + 'ZW': '41.85.192.0/19', + } + + @classmethod + def random_ipv4(cls, code): + block = cls._country_ip_map.get(code.upper()) + if not block: + return None + addr, preflen = block.split('/') + addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0] + addr_max = addr_min | (0xffffffff >> int(preflen)) + return compat_str(socket.inet_ntoa( + compat_struct_pack('!L', random.randint(addr_min, addr_max)))) + + class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): def __init__(self, proxies=None): # Set default handlers diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 1f84acfea..530e1856b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.02.11' +__version__ = '2017.02.17'