From a998cb2f7cb4f780cb583d48d55cf8f485d4fff5 Mon Sep 17 00:00:00 2001 From: yc0 Date: Wed, 11 Oct 2017 23:41:17 +0800 Subject: [PATCH] [XMovies8IE] Add new extractor --- youtube_dl/extractor/xmovies8.py | 348 ++++++++----------------------- 1 file changed, 90 insertions(+), 258 deletions(-) diff --git a/youtube_dl/extractor/xmovies8.py b/youtube_dl/extractor/xmovies8.py index 6a8decc88..394700632 100644 --- a/youtube_dl/extractor/xmovies8.py +++ b/youtube_dl/extractor/xmovies8.py @@ -1,53 +1,33 @@ # coding: utf-8 from __future__ import unicode_literals -import re, time,operator +import re +import time +import operator + from .common import InfoExtractor from ..compat import ( compat_HTTPError, - compat_str, - compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, ) from ..utils import ( - clean_html, - urljoin, compat_urlparse, - ExtractorError, - sanitized_Request, - update_Request + ExtractorError ) + + def urljoin(*args): """ Joins given arguments into a url. Trailing but not leading slashes are stripped for each argument. + + The urljoin in utils is not suitable for me. + I do not want to join url with the base url. + I only want to concat two paths without duplicate slashs """ return "/".join(map(lambda x: str(x).rstrip('/'), args)) -def cookie_to_dict(cookie): - cookie_dict = { - 'name': cookie.name, - 'value': cookie.value, - } - if cookie.port_specified: - cookie_dict['port'] = cookie.port - if cookie.domain_specified: - cookie_dict['domain'] = cookie.domain - if cookie.path_specified: - cookie_dict['path'] = cookie.path - if cookie.expires is not None: - cookie_dict['expires'] = cookie.expires - if cookie.secure is not None: - cookie_dict['secure'] = cookie.secure - if cookie.discard is not None: - cookie_dict['discard'] = cookie.discard - try: - if (cookie.has_nonstandard_attr('httpOnly') or - cookie.has_nonstandard_attr('httponly') or - cookie.has_nonstandard_attr('HttpOnly')): - cookie_dict['httponly'] = True - except TypeError: - pass - return cookie_dict + + def evaluate_expression(expr): """Evaluate a Javascript expression for the challange and return its value""" stack = [] @@ -55,7 +35,7 @@ def evaluate_expression(expr): value = "" for index, char in enumerate(expr): if char == "(": - stack.append(index+1) + stack.append(index + 1) elif char == ")": begin = stack.pop() if stack: @@ -66,19 +46,23 @@ def evaluate_expression(expr): num += expression_values[part] value += str(num) return int(value) - + + operator_functions = { "+": operator.add, "-": operator.sub, "*": operator.mul, } - + + expression_values = { "": 0, "+": 0, "!+": 1, "+!!": 1, } + + class XMovies8IE(InfoExtractor): _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' _VALID_URL = r'''(?x) @@ -89,7 +73,7 @@ class XMovies8IE(InfoExtractor): ''' _TEST = { 'url': 'https://xmovies8.es/movie/the-hitman-s-bodyguard-2017.58852', - + # 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', 'md5': 'f72c89fe7ecc14c1b5ce506c4996046e', 'info_dict': { @@ -99,88 +83,25 @@ class XMovies8IE(InfoExtractor): 'description': "The world's top bodyguard gets a new client, a hit man who must testify at the International Court of Justice. They must put their differences aside and work together to make it to the trial on time.", 'thumbnail': 'https://img.xmovies88.stream/crop/215/310/media/imagesv2/2017/08/the-hitman-s-bodyguard-2017-poster.jpg', 'formats': [{ - 'format_id': '1287', - 'url': 'https://s4.ostr.tv/hls/qvsbfwjmnxblgwsztrb2a5mblc3lpikarb6xmlv774kcxkug6nhunwo5q6pa/index-v1-a1.m3u8', - 'manifest_url': 'https://s4.ostr.tv/hls/,qvsbfwjmnxblgwsztrb2a5mblc3lpikarb6xmlv774kcxkug6nhunwo5q6pa,.urlset/master.m3u8', - 'tbr': 1287.551, - 'ext': 'mp4', - 'fps': 23.974, - 'protocol': 'm3u8', - 'preference': None, - 'width': 1280, - 'height': 720, - 'vcodec': 'avc1.64001f', + 'format_id': '1287', + 'url': 'https://s4.ostr.tv/hls/qvsbfwjmnxblgwsztrb2a5mblc3lpikarb6xmlv774kcxkug6nhunwo5q6pa/index-v1-a1.m3u8', + 'manifest_url': 'https://s4.ostr.tv/hls/,qvsbfwjmnxblgwsztrb2a5mblc3lpikarb6xmlv774kcxkug6nhunwo5q6pa,.urlset/master.m3u8', + 'tbr': 1287.551, + 'ext': 'mp4', + 'fps': 23.974, + 'protocol': 'm3u8', + 'preference': None, + 'width': 1280, + 'height': 720, + 'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2'}] }, - # 'info_dict': { - # 'id': '36164052', - # 'ext': 'flv', - # 'title': '데일리 에이프릴 요정들의 시상식!', - # 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', - # 'uploader': 'dailyapril', - # 'uploader_id': 'dailyapril', - # 'upload_date': '20160503', - # }, 'params': { - # m3u8 download 'skip_download': True, } } - def _get_cv(self,ct, host_name): - #ct = ct.replace('\n', '').replace('\r', '') - #find all hidden form value - hidden = re.findall('', ct)[0] - # get var name - # var t,r,a,f, kMuTlpA={"t":+((!+[]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![]+!![]))}; - _, n, m, v = re.findall('var (:?[^,]+,)+ ([^=]+)={"([^"]+)":([^}]+)};', ct, re.DOTALL)[0] - v = self._calc_symbol(v) - # call eval() to calc expression - for op, arg in re.findall('%s\.%s(.)=([^;]+);' % (n, m), ct): - v = eval('%d %s %d' % (v, op, self._calc_symbol(arg))) - # t = re.findall('\+\s*([^\.]+)\.length', ct, re.DOTALL)[0] - # print '%s\.innerHTML\s*=\s*"([^"])";' % t - # new_len = len(re.findall('%s\.innerHTML\s*=\s*"([^"]+)";' % t, ct, re.DOTALL)[0]) - # here we assume the meaning of t in defintely hostname, cf may change in the future - v += len(host_name) - # get wait time - wait = re.findall('}, (\d+)\);', ct, re.DOTALL)[0] - return hidden, v, url, wait - def _calc_symbol(self,s): - _ = re.findall('\+?\(\(([^\)]+)\)\+\(([^\)]+)\)\)', s) - #type 1 +((...)+(...)) 2-digit num - if _: - v1, v2 = map(self._calc_symbol, _[0]) - return int(str(v1)+str(v2)) - #type 2 plain - else: - # use look-up table to replace - vmap = {'!':1, '[]':0, '!![]':1, '':0} - return sum(map(lambda x:vmap[x], s.split('+'))) - def _pycfl(self,s): - # !+[] 1 - # !![] 1 - # ![] 0 - # [] 0 - result = '' - # print(s) # DEBUG - ss = re.split('\(|\)', s) - for s in ss: - if s in ('+', ''): - continue - elif s[0] == '+': - s = s[1:] - s = s.replace('!+[]', '1') - s = s.replace('!![]', '1') - s = s.replace('![]', '0') - s = s.replace('[]', '0') - s = s.replace('+!![]', '10') - result += str(sum([int(i) for i in s.split('+')])) - return result - def _extract_all(self,txt, rules, pos=0, values=None): + def _extract_all(self, txt, rules, pos=0, values=None): """Calls extract for each rule and returns the result in a dict""" if values is None: values = {} @@ -189,22 +110,22 @@ class XMovies8IE(InfoExtractor): if key: values[key] = result return values, pos - def _extract(self,txt, begin, end, pos=0): + + def _extract(self, txt, begin, end, pos=0): """Extract the text between 'begin' and 'end' from 'txt' - + Args: txt: String to search in begin: First string to be searched for end: Second string to be searched for after 'begin' pos: Starting position for searches in 'txt' - + Returns: The string between the two search-strings 'begin' and 'end' beginning with position 'pos' in 'txt' as well as the position after 'end'. - If at least one of 'begin' or 'end' is not found, None and the original value of 'pos' is returned - + Examples: extract("abcde", "b", "d") -> "c" , 4 extract("abcde", "b", "d", 3) -> None, 3 @@ -212,36 +133,34 @@ class XMovies8IE(InfoExtractor): try: first = txt.index(begin, pos) + len(begin) last = txt.index(end, first) - return txt[first:last], last+len(end) + return txt[first:last], last + len(end) except ValueError: return None, pos - - def _solve_challenge(self, req,headers=None): + + def _solve_challenge(self, url, headers=None): try: self._request_webpage( - req, None, note='Solve Challenge',headers=headers) + url, None, note='Solving Challenge', headers=headers) except ExtractorError as ee: if not isinstance(ee.cause, compat_HTTPError) or ee.cause.code != 503: raise page = ee.cause.read().decode('utf-8') params = self._extract_all(page, ( ('jschl_vc', 'name="jschl_vc" value="', '"'), - ('pass' , 'name="pass" value="', '"'), + ('pass', 'name="pass" value="', '"'), ))[0] - params["jschl_answer"] = self._solve_jschl(req.full_url, page) + params["jschl_answer"] = self._solve_jschl(url, page) time.sleep(4) - print("params : ",params) - req = update_Request(req,urljoin(req.full_url,"/cdn-cgi/l/chk_jschl"),query=params) - self._request_webpage( - req, None, note='Downloading redirect page',headers=headers,fatal=False) - return req - # session.get(urllib.parse.urljoin(url, "/cdn-cgi/l/chk_jschl"), params=params) - # return session.cookies - def _solve_jschl(self,url, page): + # print("params : ", params) + rst = self._request_webpage( + urljoin(url, "/cdn-cgi/l/chk_jschl"), None, note='Downloading redirect page', headers=headers, fatal=False, query=params) + return rst + + def _solve_jschl(self, url, page): """Solve challenge to get 'jschl_answer' value""" data, pos = self._extract_all(page, ( - ('var' , ',f, ', '='), - ('key' , '"', '"'), + ('var', ',f, ', '='), + ('key', '"', '"'), ('expr', ':', '}') )) solution = evaluate_expression(data["expr"]) @@ -251,161 +170,74 @@ class XMovies8IE(InfoExtractor): for expr in expressions.split(";")[1:]: if expr.startswith(variable): func = operator_functions[expr[vlength]] - value = evaluate_expression(expr[vlength+2:]) + value = evaluate_expression(expr[vlength + 2:]) solution = func(solution, value) elif expr.startswith("a.value"): return solution + len(compat_urllib_parse_urlparse(url).netloc) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') isWatching = mobj.group('isWatching') - print("original :", url) - # url = compat_urlparse.urljoin(url, "/watching") if not isWatching else url - base_url = compat_urlparse.urljoin(url,"/") - print("base :", base_url) + # print("original :", url) + base_url = compat_urlparse.urljoin(url, "/") + # print("base :", base_url) parsed_url = compat_urllib_parse_urlparse(url) - print("after parsed:", parsed_url) + # print("after parsed:", parsed_url) headers = { 'User-Agent': self._USER_AGENT, - # 'Cookie':'__cfduid='+cfduid, - 'Referer':'http://'+parsed_url.netloc+'/', - # 'Host':parsed_url.netloc + 'Referer': 'http://' + parsed_url.netloc + '/', } - req = sanitized_Request(base_url) - self._solve_challenge(req,headers) + self._solve_challenge(base_url, headers) try: - - path = urljoin(parsed_url.path,"watching.html") if not isWatching else parsed_url.path - #print(path) - print(compat_urlparse.urljoin(base_url,path)) - webpage = self._download_webpage(compat_urlparse.urljoin(base_url,path), video_id, headers=headers) - # self.to_screen(webpage) - # title = self._html_search_regex(r'
(.+)',webpage,'title', fatal=False) - # self.to_screen(webpage) - - title = self._html_search_regex(r'(?is)]+prop="name" content="([^"]+)',webpage,'title', fatal=False) - description = self._html_search_regex(r'(?is)]+prop="description" content="([^"]+)',webpage,'description', fatal=False) - duration = self._html_search_regex(r'(?is)]+prop="duration" content="([^"]+)',webpage,'duration', fatal=False) - thumbnailUrl = self._html_search_regex(r'(?is)]+prop="thumbnailUrl" href="([^"]+)',webpage,'thumbnailUrl', fatal=False) + path = urljoin(parsed_url.path, "watching.html") if not isWatching else parsed_url.path + # print(compat_urlparse.urljoin(base_url, path)) + webpage = self._download_webpage(compat_urlparse.urljoin(base_url, path), video_id, headers=headers) + title = self._html_search_regex(r'(?is)]+prop="name" content="([^"]+)', webpage, 'title', fatal=False) + description = self._html_search_regex(r'(?is)]+prop="description" content="([^"]+)', webpage, 'description', fatal=False) + # duration = self._html_search_regex(r'(?is)]+prop="duration" content="([^"]+)', webpage, 'duration', fatal=False) + thumbnailUrl = self._html_search_regex(r'(?is)]+prop="thumbnailUrl" href="([^"]+)', webpage, 'thumbnailUrl', fatal=False) - player_id = self._html_search_regex(r'[^}]+else[^{]+{.*load_player\(\'(\d+)\'[^\)]*',webpage,'player_id', fatal=False) - movie_id = self._html_search_regex(r']+/javascript\"> var movie = { id: (\d+),',webpage,'movie_id', fatal=False) + player_id = self._html_search_regex(r'[^}]+else[^{]+{.*load_player\(\'(\d+)\'[^\)]*', webpage, 'player_id', fatal=False) + movie_id = self._html_search_regex(r']+/javascript\"> var movie = { id: (\d+),', webpage, 'movie_id', fatal=False) - print(compat_urlparse.urljoin(base_url,"/ajax/movie/load_player_v3")) - load_player_v3 = self._download_json(compat_urlparse.urljoin(base_url,"/ajax/movie/load_player_v3"),video_id,headers=headers,query={'id':player_id}) + # print(compat_urlparse.urljoin(base_url, "/ajax/movie/load_player_v3")) + load_player_v3 = self._download_json(compat_urlparse.urljoin(base_url, "/ajax/movie/load_player_v3"), video_id, note="Downloading player v3", headers=headers, query={'id': player_id}) - print(title) - print(player_id) - print(load_player_v3) - print(load_player_v3.get('value')) + # print(title) + # print(player_id) + # print(load_player_v3) + # print(load_player_v3.get('value')) - playlist = self._download_json(parsed_url.scheme+":"+load_player_v3.get('value'),video_id,headers=headers) - print(playlist) + playlist = self._download_json(parsed_url.scheme + ":" + load_player_v3.get('value'), video_id, note="Downloading video format", headers=headers) + # print(playlist) formats = None for play in playlist.get('playlist'): - print(play.get('file')) + # print(play.get('file')) # m3u8_formats = self._extract_m3u8_formats(play.get('file'),video_id) - formats = self._extract_m3u8_formats(play.get('file'),video_id,"mp4") - print(formats) - if not formats and error: - raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + formats = self._extract_m3u8_formats(play.get('file'), video_id, "mp4") + # print(formats) + self._sort_formats(formats) - print({ - 'id': movie_id, - 'title': title, - 'ext':formats[0].get('ext'), - 'description': description, - 'thumbnail': thumbnailUrl, - 'formats': formats - }) + # print({ + # 'id': movie_id, + # 'title': title, + # 'ext': formats[0].get('ext'), + # 'description': description, + # 'thumbnail': thumbnailUrl, + # 'formats': formats + # }) return { 'id': movie_id, 'title': title, - 'ext':formats[0].get('ext'), + 'ext': formats[0].get('ext'), 'description': description, 'thumbnail': thumbnailUrl, 'formats': formats } except ExtractorError as ee: - print("OOOOOO") - print(ee) if not isinstance(ee.cause, compat_HTTPError) or \ ee.cause.code != 503: self.to_screen(ee.cause.read().decode('utf-8')) raise - redir_webpage = ee.cause.read().decode('utf-8') - cfduid = self._get_cookies(parsed_url.netloc).get('__cfduid').value - self._set_cookie(parsed_url.netloc,'__cfduid',cfduid) - - c, v, u, w = self._get_cv(redir_webpage, parsed_url.netloc) - print(c,v,u,w) - # action = self._search_regex( - # r'
', - # redir_webpage, 'redirect vc value') - # pwd = self._search_regex( - # r'', - # redir_webpage, 'redirect pass value') - # av = re.search( - # r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);', - # redir_webpage) - # init = re.search( - # r''' - # (?sx)setTimeout\((?:.)*var\s+(?:[a-z],)*\s+(?P[a-zA-Z]*)={\"(?P[a-zA-Z]*)\":(?P[\(\)!\[\]\+]*) - # ''' - # ,redir_webpage) - - # ans = int(self._pycfl(init.group('init'))) - # for content in re.finditer(r''+init.group('dict')+'\.'+init.group('key')+'(?P[+\-\*/])=(?P[\(\)!\[\]\+]*);',redir_webpage): - # if '*' == content.group('oper'): - # ans *= int(self._pycfl(content.group('val'))) - # elif '+' == content.group('oper'): - # ans += int(self._pycfl(content.group('val'))) - # elif '-' == content.group('oper'): - # ans -= int(self._pycfl(content.group('val'))) - # elif '/' == content.group('oper'): - # ans /= int(self._pycfl(content.group('val'))) - - # ans += len(parsed_url.netloc) - # confirm_url = ( - # parsed_url.scheme + '://' + parsed_url.netloc + - # action + '?' + - # compat_urllib_parse_urlencode({ - # 'jschl_vc': vc, - # # 'pass': pwd, - # 'jschl_answer': compat_str(ans) - # }) - # ) - try: - time.sleep(int(w)//1000) - urlh = self._request_webpage( - req, None, note='Downloading redirect page',headers=headers,fatal=False) - # print('%s://%s%s?%s&jschl_answer=%s' % (parsed_url.scheme, parsed_url.netloc,u, c, v)) - # print(confirm_url) - - # webpage, url_handle = self._download_webpage_handle( - # confirm_url, None, 'Downloading login page',headers=headers) - # webpage = self._download_webpage( - # confirm_url, video_id, - # note='Confirming after redirect', - # headers=headers) - - self.to_screen(webpage) - # title = self._html_search_regex(r'
]+

(.+?)

', webpage, 'title', fatal=False) - # print(title) - return { - 'id': video_id, - # 'title': title, - 'description': self._og_search_description(webpage), - # 'uploader': self._search_regex(r']+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), - # TODO more properties (see youtube_dl/extractor/common.py) - } - except ExtractorError as ee: - if not isinstance(ee.cause, compat_HTTPError) or \ - ee.cause.code != 503: - raise - webpage = ee.cause.read().decode('utf-8') -