diff --git a/youtube_dl/extractor/xmovies8.py b/youtube_dl/extractor/xmovies8.py
index 6a8decc88..394700632 100644
--- a/youtube_dl/extractor/xmovies8.py
+++ b/youtube_dl/extractor/xmovies8.py
@@ -1,53 +1,33 @@
 # coding: utf-8
 from __future__ import unicode_literals
-import re, time,operator
+import re
+import time
+import operator
+
 from .common import InfoExtractor
 from ..compat import (
     compat_HTTPError,
-    compat_str,
-    compat_urllib_parse_urlencode,
     compat_urllib_parse_urlparse,
 )
 from ..utils import (
-    clean_html,
-    urljoin,
     compat_urlparse,
-    ExtractorError,
-    sanitized_Request,
-    update_Request
+    ExtractorError
 )
+
+
 def urljoin(*args):
     """
     Joins given arguments into a url. Trailing but not leading slashes are
     stripped for each argument.
+
+    The urljoin in utils is not suitable here: it joins a url with a
+    base url, whereas this helper only concatenates two path segments
+    without duplicate slashes.
     """
     return "/".join(map(lambda x: str(x).rstrip('/'), args))
-def cookie_to_dict(cookie):
-    cookie_dict = {
-        'name': cookie.name,
-        'value': cookie.value,
-    }
-    if cookie.port_specified:
-        cookie_dict['port'] = cookie.port
-    if cookie.domain_specified:
-        cookie_dict['domain'] = cookie.domain
-    if cookie.path_specified:
-        cookie_dict['path'] = cookie.path
-    if cookie.expires is not None:
-        cookie_dict['expires'] = cookie.expires
-    if cookie.secure is not None:
-        cookie_dict['secure'] = cookie.secure
-    if cookie.discard is not None:
-        cookie_dict['discard'] = cookie.discard
-    try:
-        if (cookie.has_nonstandard_attr('httpOnly') or
-                cookie.has_nonstandard_attr('httponly') or
-                cookie.has_nonstandard_attr('HttpOnly')):
-            cookie_dict['httponly'] = True
-    except TypeError:
-        pass
-    return cookie_dict
+
+
 def evaluate_expression(expr):
     """Evaluate a Javascript expression for the challange and return its value"""
     stack = []
@@ -55,7 +35,7 @@ def evaluate_expression(expr):
     value = ""
     for index, char in enumerate(expr):
         if char == "(":
-            stack.append(index+1)
+            stack.append(index + 1)
         elif char == ")":
             begin = stack.pop()
             if stack:
@@ -66,19 +46,23 @@ def evaluate_expression(expr):
             num += expression_values[part]
         value += str(num)
     return int(value)
-
+
+
 operator_functions = {
     "+": operator.add,
     "-": operator.sub,
     "*": operator.mul,
 }
-
+
+
 expression_values = {
     "": 0,
     "+": 0,
     "!+": 1,
     "+!!": 1,
 }
+
+
 class XMovies8IE(InfoExtractor):
     _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0'
     _VALID_URL = r'''(?x)
@@ -89,7 +73,7 @@ class XMovies8IE(InfoExtractor):
                     '''
     _TEST = {
         'url': 'https://xmovies8.es/movie/the-hitman-s-bodyguard-2017.58852',
-
+        # 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)',
         'md5': 'f72c89fe7ecc14c1b5ce506c4996046e',
         'info_dict': {
@@ -99,88 +83,25 @@ class XMovies8IE(InfoExtractor):
             'description': "The world's top bodyguard gets a new client, a hit man who must testify at the International Court of Justice. They must put their differences aside and work together to make it to the trial on time.",
             'thumbnail': 'https://img.xmovies88.stream/crop/215/310/media/imagesv2/2017/08/the-hitman-s-bodyguard-2017-poster.jpg',
             'formats': [{
-                        'format_id': '1287',
-                        'url': 'https://s4.ostr.tv/hls/qvsbfwjmnxblgwsztrb2a5mblc3lpikarb6xmlv774kcxkug6nhunwo5q6pa/index-v1-a1.m3u8',
-                        'manifest_url': 'https://s4.ostr.tv/hls/,qvsbfwjmnxblgwsztrb2a5mblc3lpikarb6xmlv774kcxkug6nhunwo5q6pa,.urlset/master.m3u8',
-                        'tbr': 1287.551,
-                        'ext': 'mp4',
-                        'fps': 23.974,
-                        'protocol': 'm3u8',
-                        'preference': None,
-                        'width': 1280,
-                        'height': 720,
-                        'vcodec': 'avc1.64001f',
+                'format_id': '1287',
+                'url': 'https://s4.ostr.tv/hls/qvsbfwjmnxblgwsztrb2a5mblc3lpikarb6xmlv774kcxkug6nhunwo5q6pa/index-v1-a1.m3u8',
+                'manifest_url': 'https://s4.ostr.tv/hls/,qvsbfwjmnxblgwsztrb2a5mblc3lpikarb6xmlv774kcxkug6nhunwo5q6pa,.urlset/master.m3u8',
+                'tbr': 1287.551,
+                'ext': 'mp4',
+                'fps': 23.974,
+                'protocol': 'm3u8',
+                'preference': None,
+                'width': 1280,
+                'height': 720,
+                'vcodec': 'avc1.64001f',
                 'acodec': 'mp4a.40.2'}]
         },
-        # 'info_dict': {
-        #     'id': '36164052',
-        #     'ext': 'flv',
-        #     'title': '데일리 에이프릴 요정들의 시상식!',
-        #     'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
-        #     'uploader': 'dailyapril',
-        #     'uploader_id': 'dailyapril',
-        #     'upload_date': '20160503',
-        # },
         'params': {
-            # m3u8 download
             'skip_download': True,
         }
     }
-    def _get_cv(self,ct, host_name):
-        #ct = ct.replace('\n', '').replace('\r', '')
-        #find all hidden form value
-        hidden = re.findall('', ct)[0]
-        # get var name
-        # var t,r,a,f, kMuTlpA={"t":+((!+[]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![]+!![]))};
-        _, n, m, v = re.findall('var (:?[^,]+,)+ ([^=]+)={"([^"]+)":([^}]+)};', ct, re.DOTALL)[0]
-        v = self._calc_symbol(v)
-        # call eval() to calc expression
-        for op, arg in re.findall('%s\.%s(.)=([^;]+);' % (n, m), ct):
-            v = eval('%d %s %d' % (v, op, self._calc_symbol(arg)))
-        # t = re.findall('\+\s*([^\.]+)\.length', ct, re.DOTALL)[0]
-        # print '%s\.innerHTML\s*=\s*"([^"])";' % t
-        # new_len = len(re.findall('%s\.innerHTML\s*=\s*"([^"]+)";' % t, ct, re.DOTALL)[0])
-        # here we assume the meaning of t in defintely hostname, cf may change in the future
-        v += len(host_name)
-        # get wait time
-        wait = re.findall('}, (\d+)\);', ct, re.DOTALL)[0]
-        return hidden, v, url, wait
-    def _calc_symbol(self,s):
-        _ = re.findall('\+?\(\(([^\)]+)\)\+\(([^\)]+)\)\)', s)
-        #type 1 +((...)+(...)) 2-digit num
-        if _:
-            v1, v2 = map(self._calc_symbol, _[0])
-            return int(str(v1)+str(v2))
-        #type 2 plain
-        else:
-            # use look-up table to replace
-            vmap = {'!':1, '[]':0, '!![]':1, '':0}
-            return sum(map(lambda x:vmap[x], s.split('+')))
-    def _pycfl(self,s):
-        # !+[] 1
-        # !![] 1
-        # ![] 0
-        # [] 0
-        result = ''
-        # print(s) # DEBUG
-        ss = re.split('\(|\)', s)
-        for s in ss:
-            if s in ('+', ''):
-                continue
-            elif s[0] == '+':
-                s = s[1:]
-            s = s.replace('!+[]', '1')
-            s = s.replace('!![]', '1')
-            s = s.replace('![]', '0')
-            s = s.replace('[]', '0')
-            s = s.replace('+!![]', '10')
-            result += str(sum([int(i) for i in s.split('+')]))
-        return result
-    def _extract_all(self,txt, rules, pos=0, values=None):
+    def _extract_all(self, txt, rules, pos=0, values=None):
         """Calls extract for each rule and returns the result in a dict"""
         if values is None:
             values = {}
@@ -189,22 +110,22 @@ class XMovies8IE(InfoExtractor):
             if key:
                 values[key] = result
         return values, pos
-    def _extract(self,txt, begin, end, pos=0):
+
+    def _extract(self, txt, begin, end, pos=0):
         """Extract the text between 'begin' and 'end' from 'txt'
-
+
         Args:
            txt: String to search in
            begin: First string to be searched for
            end: Second string to be searched for after 'begin'
            pos: Starting position for searches in 'txt'
-
+
        Returns:
            The string between the two search-strings 'begin' and 'end'
            beginning with position 'pos' in 'txt' as well as the position
            after 'end'.
-            If at least one of 'begin' or 'end' is not found, None and the
-            original value of 'pos' is returned
-
+
        Examples:
            extract("abcde", "b", "d") -> "c" , 4
            extract("abcde", "b", "d", 3) -> None, 3
@@ -212,36 +133,34 @@
         try:
             first = txt.index(begin, pos) + len(begin)
             last = txt.index(end, first)
-            return txt[first:last], last+len(end)
+            return txt[first:last], last + len(end)
         except ValueError:
             return None, pos
-
-    def _solve_challenge(self, req,headers=None):
+
+    def _solve_challenge(self, url, headers=None):
         try:
             self._request_webpage(
-                req, None, note='Solve Challenge',headers=headers)
+                url, None, note='Solving Challenge', headers=headers)
         except ExtractorError as ee:
             if not isinstance(ee.cause, compat_HTTPError) or ee.cause.code != 503:
                 raise
             page = ee.cause.read().decode('utf-8')
             params = self._extract_all(page, (
                 ('jschl_vc', 'name="jschl_vc" value="', '"'),
-                ('pass' , 'name="pass" value="', '"'),
+                ('pass', 'name="pass" value="', '"'),
             ))[0]
-            params["jschl_answer"] = self._solve_jschl(req.full_url, page)
+            params["jschl_answer"] = self._solve_jschl(url, page)
             time.sleep(4)
-            print("params : ",params)
-            req = update_Request(req,urljoin(req.full_url,"/cdn-cgi/l/chk_jschl"),query=params)
-            self._request_webpage(
-                req, None, note='Downloading redirect page',headers=headers,fatal=False)
-            return req
-            # session.get(urllib.parse.urljoin(url, "/cdn-cgi/l/chk_jschl"), params=params)
-            # return session.cookies
-    def _solve_jschl(self,url, page):
+            # print("params : ", params)
+            rst = self._request_webpage(
+                urljoin(url, "/cdn-cgi/l/chk_jschl"), None, note='Downloading redirect page', headers=headers, fatal=False, query=params)
+            return rst
+
+    def _solve_jschl(self, url, page):
         """Solve challenge to get 'jschl_answer' value"""
         data, pos = self._extract_all(page, (
-            ('var' , ',f, ', '='),
-            ('key' , '"', '"'),
+            ('var', ',f, ', '='),
+            ('key', '"', '"'),
             ('expr', ':', '}')
         ))
         solution = evaluate_expression(data["expr"])
@@ -251,161 +170,74 @@
         for expr in expressions.split(";")[1:]:
             if expr.startswith(variable):
                 func = operator_functions[expr[vlength]]
-                value = evaluate_expression(expr[vlength+2:])
+                value = evaluate_expression(expr[vlength + 2:])
                 solution = func(solution, value)
             elif expr.startswith("a.value"):
                 return solution + len(compat_urllib_parse_urlparse(url).netloc)
+
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
         isWatching = mobj.group('isWatching')
-        print("original :", url)
-        # url = compat_urlparse.urljoin(url, "/watching") if not isWatching else url
-        base_url = compat_urlparse.urljoin(url,"/")
-        print("base :", base_url)
+        # print("original :", url)
+        base_url = compat_urlparse.urljoin(url, "/")
+        # print("base :", base_url)
         parsed_url = compat_urllib_parse_urlparse(url)
-        print("after parsed:", parsed_url)
+        # print("after parsed:", parsed_url)
         headers = {
             'User-Agent': self._USER_AGENT,
-            # 'Cookie':'__cfduid='+cfduid,
-            'Referer':'http://'+parsed_url.netloc+'/',
-            # 'Host':parsed_url.netloc
+            'Referer': 'http://' + parsed_url.netloc + '/',
         }
-        req = sanitized_Request(base_url)
-        self._solve_challenge(req,headers)
+        self._solve_challenge(base_url, headers)
         try:
-
-            path = urljoin(parsed_url.path,"watching.html") if not isWatching else parsed_url.path
-            #print(path)
-            print(compat_urlparse.urljoin(base_url,path))
-            webpage = self._download_webpage(compat_urlparse.urljoin(base_url,path), video_id, headers=headers)
-            # self.to_screen(webpage)
-            # title = self._html_search_regex(r'
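
Note for reviewers: the Cloudflare IUAM obfuscation that `expression_values`, `operator_functions` and `evaluate_expression` decode is hard to read from the tables alone, so here is a minimal standalone sketch of the digit-group arithmetic. `evaluate_group` is a hypothetical helper for illustration only, not part of the patch; the sample group strings are taken from the challenge expression quoted in the removed `_get_cv` comment, and the two dicts mirror the patch's tables.

    import operator

    # Mirrors `expression_values` in the patch: splitting an obfuscated group
    # on "[]" leaves only these fragments. In JavaScript, "!+[]" and "+!![]"
    # each coerce to 1, while "+" and "" contribute 0.
    expression_values = {"": 0, "+": 0, "!+": 1, "+!!": 1}

    # Mirrors `operator_functions`: applied when later challenge statements
    # mutate the running solution with +=, -= or *=.
    operator_functions = {"+": operator.add, "-": operator.sub, "*": operator.mul}


    def evaluate_group(group):
        """Hypothetical helper: sum the atoms of one parenthesized group."""
        return sum(expression_values[part] for part in group.split("[]"))


    # The quoted initializer +((!+[]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![]+!![]))
    # encodes 36: each group yields one decimal digit (3 and 6), and JavaScript
    # string concatenation joins them before the leading "+" coerces to int.
    digits = [evaluate_group(g)
              for g in ("!+[]+!![]+!![]+[]", "!+[]+!![]+!![]+!![]+!![]+!![]")]
    assert int("".join(str(d) for d in digits)) == 36

    # A later "VAR.KEY += <group>" statement maps "+" through the same table:
    assert operator_functions["+"](36, evaluate_group("!+[]")) == 37

The final answer additionally adds the length of the request host, which is why `_solve_jschl` returns `solution + len(compat_urllib_parse_urlparse(url).netloc)`.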