
[XMovies8IE] Add new extractor

This commit is contained in:
yc0 2017-10-11 23:41:17 +08:00
parent f9045dfb04
commit a998cb2f7c


@@ -1,53 +1,33 @@
# coding: utf-8
from __future__ import unicode_literals
import re
import time
import operator
from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
compat_str,
compat_urllib_parse_urlencode,
compat_urllib_parse_urlparse,
)
from ..utils import (
clean_html,
urljoin,
compat_urlparse,
ExtractorError,
)
def urljoin(*args):
"""
Joins the given arguments into a URL. Trailing, but not leading, slashes
are stripped from each argument.
The urljoin in utils is not suitable here: it resolves a path against a
base URL, whereas this helper only concatenates two paths without
duplicating slashes.
"""
return "/".join(map(lambda x: str(x).rstrip('/'), args))
def evaluate_expression(expr):
"""Evaluate a Javascript expression for the challange and return its value"""
stack = []
@@ -55,7 +35,7 @@ def evaluate_expression(expr):
value = ""
for index, char in enumerate(expr):
if char == "(":
stack.append(index + 1)
elif char == ")":
begin = stack.pop()
if stack:
@@ -66,19 +46,23 @@ def evaluate_expression(expr):
num += expression_values[part]
value += str(num)
return int(value)
operator_functions = {
"+": operator.add,
"-": operator.sub,
"*": operator.mul,
}
expression_values = {
"": 0,
"+": 0,
"!+": 1,
"+!!": 1,
}
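# Look-up tables for the arithmetic in Cloudflare's challenge script.
# Example: "(!+[]+!![]+!![]+[])" splits on "[]" into ["!+", "+!!", "+!!", "+", ""]
# -> 1 + 1 + 1 + 0 + 0 = 3; the digits of each group are concatenated, so
# "+((!+[]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![]+!![]))" evaluates to 36.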
class XMovies8IE(InfoExtractor):
_USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0'
_VALID_URL = r'''(?x)
@@ -89,7 +73,7 @@ class XMovies8IE(InfoExtractor):
'''
_TEST = {
'url': 'https://xmovies8.es/movie/the-hitman-s-bodyguard-2017.58852',
'md5': 'f72c89fe7ecc14c1b5ce506c4996046e',
'info_dict': {
@@ -99,88 +83,25 @@ class XMovies8IE(InfoExtractor):
'description': "The world's top bodyguard gets a new client, a hit man who must testify at the International Court of Justice. They must put their differences aside and work together to make it to the trial on time.",
'thumbnail': 'https://img.xmovies88.stream/crop/215/310/media/imagesv2/2017/08/the-hitman-s-bodyguard-2017-poster.jpg',
'formats': [{
'format_id': '1287',
'url': 'https://s4.ostr.tv/hls/qvsbfwjmnxblgwsztrb2a5mblc3lpikarb6xmlv774kcxkug6nhunwo5q6pa/index-v1-a1.m3u8',
'manifest_url': 'https://s4.ostr.tv/hls/,qvsbfwjmnxblgwsztrb2a5mblc3lpikarb6xmlv774kcxkug6nhunwo5q6pa,.urlset/master.m3u8',
'tbr': 1287.551,
'ext': 'mp4',
'fps': 23.974,
'protocol': 'm3u8',
'preference': None,
'width': 1280,
'height': 720,
'vcodec': 'avc1.64001f',
'acodec': 'mp4a.40.2'}]
},
'params': {
# m3u8 download
'skip_download': True,
}
}
def _extract_all(self, txt, rules, pos=0, values=None):
"""Calls extract for each rule and returns the result in a dict"""
if values is None:
values = {}
@@ -189,22 +110,22 @@ class XMovies8IE(InfoExtractor):
if key:
values[key] = result
return values, pos
def _extract(self, txt, begin, end, pos=0):
"""Extract the text between 'begin' and 'end' from 'txt'
Args:
txt: String to search in
begin: First string to be searched for
end: Second string to be searched for after 'begin'
pos: Starting position for searches in 'txt'
Returns:
The string between the two search strings 'begin' and 'end', starting at
position 'pos' in 'txt', together with the position after 'end'.
If at least one of 'begin' or 'end' is not found, None and the original
value of 'pos' are returned
Examples:
extract("abcde", "b", "d") -> "c" , 4
extract("abcde", "b", "d", 3) -> None, 3
@@ -212,36 +133,34 @@ class XMovies8IE(InfoExtractor):
try:
first = txt.index(begin, pos) + len(begin)
last = txt.index(end, first)
return txt[first:last], last + len(end)
except ValueError:
return None, pos
def _solve_challenge(self, url, headers=None):
try:
self._request_webpage(
url, None, note='Solving Challenge', headers=headers)
except ExtractorError as ee:
if not isinstance(ee.cause, compat_HTTPError) or ee.cause.code != 503:
raise
page = ee.cause.read().decode('utf-8')
params = self._extract_all(page, (
('jschl_vc', 'name="jschl_vc" value="', '"'),
('pass', 'name="pass" value="', '"'),
))[0]
params["jschl_answer"] = self._solve_jschl(req.full_url, page)
params["jschl_answer"] = self._solve_jschl(url, page)
time.sleep(4)
print("params : ",params)
req = update_Request(req,urljoin(req.full_url,"/cdn-cgi/l/chk_jschl"),query=params)
self._request_webpage(
req, None, note='Downloading redirect page',headers=headers,fatal=False)
return req
# session.get(urllib.parse.urljoin(url, "/cdn-cgi/l/chk_jschl"), params=params)
# return session.cookies
def _solve_jschl(self,url, page):
# print("params : ", params)
rst = self._request_webpage(
urljoin(url, "/cdn-cgi/l/chk_jschl"), None, note='Downloading redirect page', headers=headers, fatal=False, query=params)
return rst
def _solve_jschl(self, url, page):
"""Solve challenge to get 'jschl_answer' value"""
data, pos = self._extract_all(page, (
('var', ',f, ', '='),
('key', '"', '"'),
('expr', ':', '}')
))
solution = evaluate_expression(data["expr"])
@@ -251,161 +170,74 @@ class XMovies8IE(InfoExtractor):
for expr in expressions.split(";")[1:]:
if expr.startswith(variable):
func = operator_functions[expr[vlength]]
value = evaluate_expression(expr[vlength + 2:])
solution = func(solution, value)
elif expr.startswith("a.value"):
return solution + len(compat_urllib_parse_urlparse(url).netloc)
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
isWatching = mobj.group('isWatching')
print("original :", url)
# url = compat_urlparse.urljoin(url, "/watching") if not isWatching else url
base_url = compat_urlparse.urljoin(url,"/")
print("base :", base_url)
# print("original :", url)
base_url = compat_urlparse.urljoin(url, "/")
# print("base :", base_url)
parsed_url = compat_urllib_parse_urlparse(url)
print("after parsed:", parsed_url)
# print("after parsed:", parsed_url)
headers = {
'User-Agent': self._USER_AGENT,
'Referer': 'http://' + parsed_url.netloc + '/',
}
self._solve_challenge(base_url, headers)
try:
path = urljoin(parsed_url.path,"watching.html") if not isWatching else parsed_url.path
#print(path)
print(compat_urlparse.urljoin(base_url,path))
webpage = self._download_webpage(compat_urlparse.urljoin(base_url,path), video_id, headers=headers)
# self.to_screen(webpage)
# title = self._html_search_regex(r'<div class="info_movie(?:\sfull.*)[^<]+class="full desc.*<h1>(.+)</h1>',webpage,'title', fatal=False)
# self.to_screen(webpage)
title = self._html_search_regex(r'(?is)<meta[^>]+prop="name" content="([^"]+)',webpage,'title', fatal=False)
description = self._html_search_regex(r'(?is)<meta[^>]+prop="description" content="([^"]+)',webpage,'description', fatal=False)
duration = self._html_search_regex(r'(?is)<meta[^>]+prop="duration" content="([^"]+)',webpage,'duration', fatal=False)
thumbnailUrl = self._html_search_regex(r'(?is)<link[^>]+prop="thumbnailUrl" href="([^"]+)',webpage,'thumbnailUrl', fatal=False)
path = urljoin(parsed_url.path, "watching.html") if not isWatching else parsed_url.path
# print(compat_urlparse.urljoin(base_url, path))
webpage = self._download_webpage(compat_urlparse.urljoin(base_url, path), video_id, headers=headers)
title = self._html_search_regex(r'(?is)<meta[^>]+prop="name" content="([^"]+)', webpage, 'title', fatal=False)
description = self._html_search_regex(r'(?is)<meta[^>]+prop="description" content="([^"]+)', webpage, 'description', fatal=False)
# duration = self._html_search_regex(r'(?is)<meta[^>]+prop="duration" content="([^"]+)', webpage, 'duration', fatal=False)
thumbnailUrl = self._html_search_regex(r'(?is)<link[^>]+prop="thumbnailUrl" href="([^"]+)', webpage, 'thumbnailUrl', fatal=False)
player_id = self._html_search_regex(r'[^}]+else[^{]+{.*load_player\(\'(\d+)\'[^\)]*', webpage, 'player_id', fatal=False)
movie_id = self._html_search_regex(r'<script[^>]+/javascript\"> var movie = { id: (\d+),', webpage, 'movie_id', fatal=False)
print(compat_urlparse.urljoin(base_url,"/ajax/movie/load_player_v3"))
load_player_v3 = self._download_json(compat_urlparse.urljoin(base_url,"/ajax/movie/load_player_v3"),video_id,headers=headers,query={'id':player_id})
# print(compat_urlparse.urljoin(base_url, "/ajax/movie/load_player_v3"))
load_player_v3 = self._download_json(compat_urlparse.urljoin(base_url, "/ajax/movie/load_player_v3"), video_id, note="Downloading player v3", headers=headers, query={'id': player_id})
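# the returned JSON's 'value' is a protocol-relative URL ('//host/...'),
# so the scheme is prepended before downloading the playlist below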
# print(title)
# print(player_id)
# print(load_player_v3)
# print(load_player_v3.get('value'))
playlist = self._download_json(parsed_url.scheme + ":" + load_player_v3.get('value'), video_id, note="Downloading video format", headers=headers)
# print(playlist)
formats = None
for play in playlist.get('playlist'):
# print(play.get('file'))
# m3u8_formats = self._extract_m3u8_formats(play.get('file'),video_id)
formats = self._extract_m3u8_formats(play.get('file'), video_id, "mp4")
# print(formats)
self._sort_formats(formats)
# print({
# 'id': movie_id,
# 'title': title,
# 'ext': formats[0].get('ext'),
# 'description': description,
# 'thumbnail': thumbnailUrl,
# 'formats': formats
# })
return {
'id': movie_id,
'title': title,
'ext': formats[0].get('ext'),
'description': description,
'thumbnail': thumbnailUrl,
'formats': formats
}
except ExtractorError as ee:
print("OOOOOO")
print(ee)
if not isinstance(ee.cause, compat_HTTPError) or \
ee.cause.code != 503:
self.to_screen(ee.cause.read().decode('utf-8'))
raise