Update youtube.py

2025-03-07 07:07:15 +08:00 · 2020-07-30 18:24:58 +05:30 · 2020-07-30 18:24:58 +05:30 · b179aa1496
commit b179aa1496
parent 5e79527881
1 changed file with 14 additions and 94 deletions
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -70,14 +70,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor):

    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'

-    _YOUTUBE_CLIENT_HEADERS = {
-        'x-youtube-client-name': '1',
-        'x-youtube-client-version': '1.20200609.04.02',
-    }
-
    def _set_language(self):
        self._set_cookie(
-            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
+            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

@@ -303,11 +298,10 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
                    # Downloading page may result in intermittent 5xx HTTP error
                    # that is usually worked around with a retry
                    more = self._download_json(
-                        'https://www.youtube.com/%s' % mobj.group('more'), playlist_id,
+                        'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                        'Downloading page #%s%s'
                        % (page_num, ' (retry #%d)' % count if count else ''),
-                        transform_source=uppercase_escape,
-                        headers=self._YOUTUBE_CLIENT_HEADERS)
+                        transform_source=uppercase_escape)
                    break
                except ExtractorError as e:
                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
@@ -1384,7 +1378,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        funcname = self._search_regex(
            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
-             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
+             r'\b(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             # Obsolete patterns
             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
@@ -1658,63 +1652,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        video_id = mobj.group(2)
        return video_id

-    def _extract_chapters_from_json(self, webpage, video_id, duration):
-        if not webpage:
-            return
-        player = self._parse_json(
-            self._search_regex(
-                r'RELATED_PLAYER_ARGS["\']\s*:\s*({.+})\s*,?\s*\n', webpage,
-                'player args', default='{}'),
-            video_id, fatal=False)
-        if not player or not isinstance(player, dict):
-            return
-        watch_next_response = player.get('watch_next_response')
-        if not isinstance(watch_next_response, compat_str):
-            return
-        response = self._parse_json(watch_next_response, video_id, fatal=False)
-        if not response or not isinstance(response, dict):
-            return
-        chapters_list = try_get(
-            response,
-            lambda x: x['playerOverlays']
-                       ['playerOverlayRenderer']
-                       ['decoratedPlayerBarRenderer']
-                       ['decoratedPlayerBarRenderer']
-                       ['playerBar']
-                       ['chapteredPlayerBarRenderer']
-                       ['chapters'],
-            list)
-        if not chapters_list:
-            return
-
-        def chapter_time(chapter):
-            return float_or_none(
-                try_get(
-                    chapter,
-                    lambda x: x['chapterRenderer']['timeRangeStartMillis'],
-                    int),
-                scale=1000)
-        chapters = []
-        for next_num, chapter in enumerate(chapters_list, start=1):
-            start_time = chapter_time(chapter)
-            if start_time is None:
-                continue
-            end_time = (chapter_time(chapters_list[next_num])
-                        if next_num < len(chapters_list) else duration)
-            if end_time is None:
-                continue
-            title = try_get(
-                chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
-                compat_str)
-            chapters.append({
-                'start_time': start_time,
-                'end_time': end_time,
-                'title': title,
-            })
-        return chapters
-
    @staticmethod
-    def _extract_chapters_from_description(description, duration):
+    def _extract_chapters(description, duration):
        if not description:
            return None
        chapter_lines = re.findall(
@@ -1748,10 +1687,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            })
        return chapters

-    def _extract_chapters(self, webpage, description, video_id, duration):
-        return (self._extract_chapters_from_json(webpage, video_id, duration)
-                or self._extract_chapters_from_description(description, duration))
-
    def _real_extract(self, url):
        url, smuggled_data = unsmuggle_url(url, {})

@@ -1898,9 +1833,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        video_details = try_get(
            player_response, lambda x: x['videoDetails'], dict) or {}

-        microformat = try_get(
-            player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
-
        video_title = video_info.get('title', [None])[0] or video_details.get('title')
        if not video_title:
            self._downloader.report_warning('Unable to extract video title')
@@ -1930,7 +1862,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            ''', replace_url, video_description)
            video_description = clean_html(video_description)
        else:
-            video_description = video_details.get('shortDescription') or self._html_search_meta('description', video_webpage)
+            video_description = self._html_search_meta('description', video_webpage) or video_details.get('shortDescription')

        if not smuggled_data.get('force_singlefeed', False):
            if not self._downloader.params.get('noplaylist'):
@@ -1978,8 +1910,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            view_count = extract_view_count(video_info)
        if view_count is None and video_details:
            view_count = int_or_none(video_details.get('viewCount'))
-        if view_count is None and microformat:
-            view_count = int_or_none(microformat.get('viewCount'))

        if is_live is None:
            is_live = bool_or_none(video_details.get('isLive'))
@@ -2262,8 +2192,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                [r'(?s)id="eow-date.*?>(.*?)</span>',
                 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
                video_webpage, 'upload date', default=None)
-        if not upload_date:
-            upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
        upload_date = unified_strdate(upload_date)

        video_license = self._html_search_regex(
@@ -2335,21 +2263,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        m_cat_container = self._search_regex(
            r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
            video_webpage, 'categories', default=None)
-        category = None
        if m_cat_container:
            category = self._html_search_regex(
                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
                default=None)
-        if not category:
-            category = try_get(
-                microformat, lambda x: x['category'], compat_str)
-        video_categories = None if category is None else [category]
+            video_categories = None if category is None else [category]
+        else:
+            video_categories = None

        video_tags = [
            unescapeHTML(m.group('content'))
            for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
-        if not video_tags:
-            video_tags = try_get(video_details, lambda x: x['keywords'], list)

        def _extract_count(count_name):
            return str_to_int(self._search_regex(
@@ -2400,7 +2324,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                    errnote='Unable to download video annotations', fatal=False,
                    data=urlencode_postdata({xsrf_field_name: xsrf_token}))

-        chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
+        chapters = self._extract_chapters(description_original, video_duration)

        # Look for the DASH manifest
        if self._downloader.params.get('youtube_include_dash_manifest', True):
@@ -2755,7 +2679,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
        ids = []
        last_id = playlist_id[-11:]
        for n in itertools.count(1):
-            url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
+            url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
            webpage = self._download_webpage(
                url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n))
            new_ids = orderedSet(re.findall(
@@ -3095,7 +3019,7 @@ class YoutubeLiveIE(YoutubeBaseInfoExtractor):

 class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    IE_DESC = 'YouTube.com user/channel playlists'
-    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists'
+    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'
    IE_NAME = 'youtube:playlists'

    _TESTS = [{
@@ -3121,9 +3045,6 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
            'title': 'Chem Player',
        },
        'skip': 'Blocked',
-    }, {
-        'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
-        'only_matching': True,
    }]


@@ -3268,10 +3189,9 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
                break

            more = self._download_json(
-                'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
+                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
-                transform_source=uppercase_escape,
-                headers=self._YOUTUBE_CLIENT_HEADERS)
+                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']