From d18c635985c03147c1af108fdab70336dc632a57 Mon Sep 17 00:00:00 2001 From: sulyi <sulyi.gbox@gmail.com> Date: Sun, 20 Nov 2016 21:09:55 +0100 Subject: [PATCH 1/2] [YouWatch] Add new extractor (for testing purposes) --- docs/supportedsites.md | 1 + youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/youwatch.py | 178 +++++++++++++++++++++++++++++ 3 files changed, 180 insertions(+) create mode 100644 youtube_dl/extractor/youwatch.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 77832504a..522ae4d4e 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -937,6 +937,7 @@ - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication) - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword) - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) + - **YouWatch** - **Zapiks** - **ZDF** - **ZDFChannel** diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 578359a5e..7b4548606 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1190,6 +1190,7 @@ from .youtube import ( YoutubeUserIE, YoutubeWatchLaterIE, ) +from .youwatch import YouWatchIE from .zapiks import ZapiksIE from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE diff --git a/youtube_dl/extractor/youwatch.py b/youtube_dl/extractor/youwatch.py new file mode 100644 index 000000000..cbfe5f2e1 --- /dev/null +++ b/youtube_dl/extractor/youwatch.py @@ -0,0 +1,178 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import sys + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + determine_ext, int_or_none, + unified_strdate, + js_to_json) + + +class YouWatchIE(InfoExtractor): + _VALID_URL = r'^(https?://(?:www\.)?youwatch\.org/)(?Pembed-)?(?P[a-z0-9]+)(\.html)?$' + _TESTS = [ + { + 'url': 'http://youwatch.org/ncvag7qib06a', + 'md5': 
'fcc3d1b77d41921ab408ddfbc51604b1', + 'info_dict': { + 'id': 'ncvag7qib06a', + 'ext': 'mp4', + 'title': 'A 4 mesterl v sz', + 'thumbnail': 'http://212.7.205.2/i/03/00000/ncvag7qib06a.jpg', + 'upload_date': '20140710', + 'view_count': int + } + }, { + 'url': 'http://youwatch.org/r4gmiun6qx57', + 'md5': '71a151e542ae86a023d8cc57e243a917', + 'info_dict': { + 'id': 'r4gmiun6qx57', + 'ext': 'mp4', + 'title': 'A h�z 2 A m�sodik t�rt�net', + 'thumbnail': 'http://212.7.211.67/i/05/00000/r4gmiun6qx57.jpg', + 'upload_date': '20150509', + 'view_count': int + } + }, { + 'url': 'http://youwatch.org/t179wlmqbndd', + 'md5': '1984ebed1fb5fee33aa7076a73faffa3', + 'info_dict': { + 'id': 't179wlmqbndd', + 'ext': 'mp4', + 'title': 'bd-ijf', + 'thumbnail': 'http://212.7.211.67/i/04/00000/t179wlmqbndd.jpg', + 'upload_date': '20150523', + 'view_count': int + } + }, { + 'url': 'http://youwatch.org/embed-2a3y6svmsofw.html', + 'md5': 'ded91fbca8c913d493263ae26e5e18d1', + 'info_dict': { + 'id': '2a3y6svmsofw', + 'ext': 'mp4', + 'title': 'A Vadnyugat v�gnapjai (2008)', + 'thumbnail': 'http://212.7.211.68/i/05/00362/0iy24azid2xj.jpg', + 'upload_date': '20140223', + 'view_count': int + } + } + ] + + def extract_arguments(self, call, code): + if not call.endswith(')'): + pattern = r'%s\s*\(' % re.escape(call) + else: + pattern = re.escape(call) + mobj = re.search(pattern, code) + + if mobj: + # XXX: context-free! 
+ close_pos = open_pos = mobj.end() + counter = 1 + while counter > 0: + if close_pos > len(code): + break + c = code[close_pos] + close_pos += 1 + if c == '(': + counter += 1 + elif c == ')': + counter -= 1 + else: + return code[open_pos:close_pos - 1] + + @staticmethod + def __v_rot(text): + if text is None: + return None + rotated = '' + for letter in text: + l_code = ord(letter) + if 64 < l_code < 91: # ord('A'): 65 ord('Z'): 90 + rotated += chr((l_code - 65 - 13) % 26 + 65) + elif 96 < l_code < 123: # ord('a'): 97 ord('z'): 122 + rotated += chr((l_code - 97 - 13) % 26 + 97) + else: + rotated += letter + return rotated + + def _real_extract(self, url): + video_id = self._match_id(url) + embed_url = None + if self._VALID_URL_RE.match(url).group('embed') is not None: + embed_url = url + url = self._VALID_URL_RE.sub(r'\1\g', url) + webpage = self._download_webpage(url, video_id) + + if 'This server is in maintenance mode. Refresh this page in some minutes.' in webpage: + raise ExtractorError('Video is temporally unavailable. ', + sys.exc_info()[2], True, video_id=video_id) + if 'The file you were looking for could not be found, sorry for any inconvenience.' in webpage: + raise ExtractorError('Video is gone. 
', + sys.exc_info()[2], True, video_id=video_id) + + title = self._html_search_regex(r'''\s*(?P.+?)\s*<\s*/''', + webpage, 'title', fatal=False, + flags=re.IGNORECASE, group='title') + title = self.__v_rot(title) + + ul_date = self._html_search_regex(r'''<i\s.*class=("|')fa fa-calendar\1.*>\s*</i>\s*on\s+(?P<date>.*)\s*<''', + webpage, 'upload date', fatal=False, + flags=re.MULTILINE | re.IGNORECASE, group='date') + ul_date = unified_strdate(ul_date, day_first=False) + + views = self._html_search_regex(r'''<\s*([^\s]+).*title=("|')views\2.*>\s*(?P<count>.*)\s*<\s*\/\s*\1\s*>''', + webpage, 'views', fatal=False, + flags=re.IGNORECASE, group='count') + views = int_or_none(views) + + if embed_url is None: + embed_url = self._html_search_regex( + r'''<iframe[^>]+class=("|')embed-responsive-item\1[^>]+src=("|')(?P<url>((?!\2).)+)\2''', + webpage, 'embed url', flags=re.IGNORECASE, group='url') + + embed_html = self._download_webpage(embed_url, video_id) + ref_url = self._html_search_regex( + r'''<iframe[^>]+src=("|')(?P<url>((?!\1).)+)\1''', + embed_html, 'referer', flags=re.IGNORECASE, group='url') + + embed_html = self._download_webpage(ref_url, video_id) + + jwplayer_setup_script = self._html_search_regex( + r'''<span\b.*id=("|')vplayer\1[^>]*>\s*<img\b[^>]*>\s*</span>\s*''' + + r'''<script\b[^>]*type=("|')((?!\2).)*\2[^>]*>(?P<script>((?!</script>).|\s)*)''', + embed_html, 'jwplayer setup script', flags=re.IGNORECASE, group='script' + ) + + jwplayer_setup = self._html_search_regex(r'(jwplayer\s*\((.|\n)*\)\.setup)', + jwplayer_setup_script, 'jwplayer') + + jwplayer_json = self.extract_arguments(jwplayer_setup, jwplayer_setup_script) + jwplayer_json = js_to_json(jwplayer_json) + # fix unbalanced commas + jwplayer_json = re.sub(r',\s*([}\]])', '\g<1>', jwplayer_json) + jwplayer_json = self._parse_json(jwplayer_json, video_id) + + video_urls = [ + { + 'url': source['file'], + 'format_id': source['label'], + 'ext': determine_ext(source['file']) + } for source in 
jwplayer_json['sources'] + ] + + info_dict = {'id': video_id, 'formats': video_urls, 'http_headers': {'Referer': ref_url}} + if jwplayer_json['image'] is not None: + info_dict['thumbnail'] = jwplayer_json['image'] + if title is not None: + info_dict['title'] = title + if ul_date is not None: + info_dict['upload_date'] = ul_date + if views is not None: + info_dict['view_count'] = views + + return info_dict From c6df9d4863d90dcd2e7cb1f6c54eb6d712b09591 Mon Sep 17 00:00:00 2001 From: sulyi <sulyi.gbox@gmail.com> Date: Sun, 20 Nov 2016 23:07:41 +0100 Subject: [PATCH 2/2] [YouWatch] Clean up deprecated json fix --- youtube_dl/extractor/youwatch.py | 29 +++-------------------------- 1 file changed, 3 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/youwatch.py b/youtube_dl/extractor/youwatch.py index cbfe5f2e1..6f0522d9c 100644 --- a/youtube_dl/extractor/youwatch.py +++ b/youtube_dl/extractor/youwatch.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re import sys +from ..jsinterp import JSInterpreter from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -62,29 +63,6 @@ class YouWatchIE(InfoExtractor): } ] - def extract_arguments(self, call, code): - if not call.endswith(')'): - pattern = r'%s\s*\(' % re.escape(call) - else: - pattern = re.escape(call) - mobj = re.search(pattern, code) - - if mobj: - # XXX: context-free! 
- close_pos = open_pos = mobj.end() - counter = 1 - while counter > 0: - if close_pos > len(code): - break - c = code[close_pos] - close_pos += 1 - if c == '(': - counter += 1 - elif c == ')': - counter -= 1 - else: - return code[open_pos:close_pos - 1] - @staticmethod def __v_rot(text): if text is None: @@ -151,10 +129,9 @@ class YouWatchIE(InfoExtractor): jwplayer_setup = self._html_search_regex(r'(jwplayer\s*\((.|\n)*\)\.setup)', jwplayer_setup_script, 'jwplayer') - jwplayer_json = self.extract_arguments(jwplayer_setup, jwplayer_setup_script) + js = JSInterpreter(jwplayer_setup_script) + jwplayer_json = js.extract_arguments(jwplayer_setup) jwplayer_json = js_to_json(jwplayer_json) - # fix unbalanced commas - jwplayer_json = re.sub(r',\s*([}\]])', '\g<1>', jwplayer_json) jwplayer_json = self._parse_json(jwplayer_json, video_id) video_urls = [