Merge branch 'ytdl-org-master'

2026-05-26 23:59:48 +08:00 · 2020-04-12 12:59:53 +03:00
parent a0e27a11b8 f7363e2fb3
commit f1604aa805
10 changed files with 183 additions and 59 deletions
@@ -2340,6 +2340,8 @@ class InfoExtractor(object):
        if res is False:
            return []
        ism_doc, urlh = res
+        if ism_doc is None:
+            return []

        return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)

@@ -77,4 +77,3 @@ class MofosexEmbedIE(InfoExtractor):
        return self.url_result(
            'http://www.mofosex.com/videos/{0}/{0}.html'.format(video_id),
            ie=MofosexIE.ie_key(), video_id=video_id)
-
@@ -26,7 +26,7 @@ class MotherlessIE(InfoExtractor):
            'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'],
            'upload_date': '20100913',
            'uploader_id': 'famouslyfuckedup',
-            'thumbnail': r're:http://.*\.jpg',
+            'thumbnail': r're:https?://.*\.jpg',
            'age_limit': 18,
        }
    }, {
@@ -40,7 +40,7 @@ class MotherlessIE(InfoExtractor):
                           'game', 'hairy'],
            'upload_date': '20140622',
            'uploader_id': 'Sulivana7x',
-            'thumbnail': r're:http://.*\.jpg',
+            'thumbnail': r're:https?://.*\.jpg',
            'age_limit': 18,
        },
        'skip': '404',
@@ -54,7 +54,7 @@ class MotherlessIE(InfoExtractor):
            'categories': ['superheroine heroine  superher'],
            'upload_date': '20140827',
            'uploader_id': 'shade0230',
-            'thumbnail': r're:http://.*\.jpg',
+            'thumbnail': r're:https?://.*\.jpg',
            'age_limit': 18,
        }
    }, {
@@ -76,7 +76,8 @@ class MotherlessIE(InfoExtractor):
            raise ExtractorError('Video %s is for friends only' % video_id, expected=True)

        title = self._html_search_regex(
-            r'id="view-upload-title">\s+([^<]+)<', webpage, 'title')
+            (r'(?s)<div[^>]+\bclass=["\']media-meta-title[^>]+>(.+?)</div>',
+             r'id="view-upload-title">\s+([^<]+)<'), webpage, 'title')
        video_url = (self._html_search_regex(
            (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
             r'fileurl\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1'),
@@ -84,14 +85,15 @@ class MotherlessIE(InfoExtractor):
            or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id)
        age_limit = self._rta_search(webpage)
        view_count = str_to_int(self._html_search_regex(
-            r'<strong>Views</strong>\s+([^<]+)<',
+            (r'>(\d+)\s+Views<', r'<strong>Views</strong>\s+([^<]+)<'),
            webpage, 'view count', fatal=False))
        like_count = str_to_int(self._html_search_regex(
-            r'<strong>Favorited</strong>\s+([^<]+)<',
+            (r'>(\d+)\s+Favorites<', r'<strong>Favorited</strong>\s+([^<]+)<'),
            webpage, 'like count', fatal=False))

        upload_date = self._html_search_regex(
-            r'<strong>Uploaded</strong>\s+([^<]+)<', webpage, 'upload date')
+            (r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<',
+             r'<strong>Uploaded</strong>\s+([^<]+)<'), webpage, 'upload date')
        if 'Ago' in upload_date:
            days = int(re.search(r'([0-9]+)', upload_date).group(1))
            upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d')
@@ -6,6 +6,7 @@ import re
 from .common import InfoExtractor
 from ..utils import (
    clean_html,
+    determine_ext,
    int_or_none,
    js_to_json,
    qualities,
@@ -33,42 +34,76 @@ class NovaEmbedIE(InfoExtractor):

        webpage = self._download_webpage(url, video_id)

-        bitrates = self._parse_json(
-            self._search_regex(
-                r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage, 'formats'),
-            video_id, transform_source=js_to_json)
-
-        QUALITIES = ('lq', 'mq', 'hq', 'hd')
-        quality_key = qualities(QUALITIES)
-
+        duration = None
        formats = []
-        for format_id, format_list in bitrates.items():
-            if not isinstance(format_list, list):
-                format_list = [format_list]
-            for format_url in format_list:
-                format_url = url_or_none(format_url)
-                if not format_url:
-                    continue
-                if format_id == 'hls':
-                    formats.extend(self._extract_m3u8_formats(
-                        format_url, video_id, ext='mp4',
-                        entry_protocol='m3u8_native', m3u8_id='hls',
-                        fatal=False))
-                    continue
-                f = {
-                    'url': format_url,
-                }
-                f_id = format_id
-                for quality in QUALITIES:
-                    if '%s.mp4' % quality in format_url:
-                        f_id += '-%s' % quality
-                        f.update({
-                            'quality': quality_key(quality),
-                            'format_note': quality.upper(),
+
+        player = self._parse_json(
+            self._search_regex(
+                r'Player\.init\s*\([^,]+,\s*({.+?})\s*,\s*{.+?}\s*\)\s*;',
+                webpage, 'player', default='{}'), video_id, fatal=False)
+        if player:
+            for format_id, format_list in player['tracks'].items():
+                if not isinstance(format_list, list):
+                    format_list = [format_list]
+                for format_dict in format_list:
+                    if not isinstance(format_dict, dict):
+                        continue
+                    format_url = url_or_none(format_dict.get('src'))
+                    format_type = format_dict.get('type')
+                    ext = determine_ext(format_url)
+                    if (format_type == 'application/x-mpegURL'
+                            or format_id == 'HLS' or ext == 'm3u8'):
+                        formats.extend(self._extract_m3u8_formats(
+                            format_url, video_id, 'mp4',
+                            entry_protocol='m3u8_native', m3u8_id='hls',
+                            fatal=False))
+                    elif (format_type == 'application/dash+xml'
+                          or format_id == 'DASH' or ext == 'mpd'):
+                        formats.extend(self._extract_mpd_formats(
+                            format_url, video_id, mpd_id='dash', fatal=False))
+                    else:
+                        formats.append({
+                            'url': format_url,
                        })
-                        break
-                f['format_id'] = f_id
-                formats.append(f)
+            duration = int_or_none(player.get('duration'))
+        else:
+            # Old path, not actual as of 08.04.2020
+            bitrates = self._parse_json(
+                self._search_regex(
+                    r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage, 'formats'),
+                video_id, transform_source=js_to_json)
+
+            QUALITIES = ('lq', 'mq', 'hq', 'hd')
+            quality_key = qualities(QUALITIES)
+
+            for format_id, format_list in bitrates.items():
+                if not isinstance(format_list, list):
+                    format_list = [format_list]
+                for format_url in format_list:
+                    format_url = url_or_none(format_url)
+                    if not format_url:
+                        continue
+                    if format_id == 'hls':
+                        formats.extend(self._extract_m3u8_formats(
+                            format_url, video_id, ext='mp4',
+                            entry_protocol='m3u8_native', m3u8_id='hls',
+                            fatal=False))
+                        continue
+                    f = {
+                        'url': format_url,
+                    }
+                    f_id = format_id
+                    for quality in QUALITIES:
+                        if '%s.mp4' % quality in format_url:
+                            f_id += '-%s' % quality
+                            f.update({
+                                'quality': quality_key(quality),
+                                'format_note': quality.upper(),
+                            })
+                            break
+                    f['format_id'] = f_id
+                    formats.append(f)
+
        self._sort_formats(formats)

        title = self._og_search_title(
@@ -81,7 +116,8 @@ class NovaEmbedIE(InfoExtractor):
            r'poster\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
            'thumbnail', fatal=False, group='value')
        duration = int_or_none(self._search_regex(
-            r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))
+            r'videoDuration\s*:\s*(\d+)', webpage, 'duration',
+            default=duration))

        return {
            'id': video_id,
@@ -246,7 +246,12 @@ class SoundcloudIE(InfoExtractor):
                'comment_count': int,
                'repost_count': int,
            },
-        }
+        },
+        {
+            # with AAC HQ format available via OAuth token
+            'url': 'https://soundcloud.com/wandw/the-chainsmokers-ft-daya-dont-let-me-down-ww-remix-1',
+            'only_matching': True,
+        },
    ]

    _API_V2_BASE = 'https://api-v2.soundcloud.com/'
@@ -350,6 +355,9 @@ class SoundcloudIE(InfoExtractor):
            format_id_list = []
            if protocol:
                format_id_list.append(protocol)
+            ext = f.get('ext')
+            if ext == 'aac':
+                f['abr'] = '256'
            for k in ('ext', 'abr'):
                v = f.get(k)
                if v:
@@ -360,9 +368,13 @@ class SoundcloudIE(InfoExtractor):
            abr = f.get('abr')
            if abr:
                f['abr'] = int(abr)
+            if protocol == 'hls':
+                protocol = 'm3u8' if ext == 'aac' else 'm3u8_native'
+            else:
+                protocol = 'http'
            f.update({
                'format_id': '_'.join(format_id_list),
-                'protocol': 'm3u8_native' if protocol == 'hls' else 'http',
+                'protocol': protocol,
                'preference': -10 if preview else None,
            })
            formats.append(f)
@@ -1,9 +1,19 @@
 # coding: utf-8
 from __future__ import unicode_literals

+import re
+
 from .common import InfoExtractor
+from .jwplatform import JWPlatformIE
 from .nexx import NexxIE
-from ..compat import compat_urlparse
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
+from ..utils import (
+    NO_DEFAULT,
+    try_get,
+)


 class Tele5IE(InfoExtractor):
@@ -44,14 +54,49 @@ class Tele5IE(InfoExtractor):
        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = (qs.get('vid') or qs.get('ve_id') or [None])[0]

-        if not video_id:
+        NEXX_ID_RE = r'\d{6,}'
+        JWPLATFORM_ID_RE = r'[a-zA-Z0-9]{8}'
+
+        def nexx_result(nexx_id):
+            return self.url_result(
+                'https://api.nexx.cloud/v3/759/videos/byid/%s' % nexx_id,
+                ie=NexxIE.ie_key(), video_id=nexx_id)
+
+        nexx_id = jwplatform_id = None
+
+        if video_id:
+            if re.match(NEXX_ID_RE, video_id):
+                return nexx_result(video_id)
+            elif re.match(JWPLATFORM_ID_RE, video_id):
+                jwplatform_id = video_id
+
+        if not nexx_id:
            display_id = self._match_id(url)
            webpage = self._download_webpage(url, display_id)
-            video_id = self._html_search_regex(
-                (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](\d+)',
-                 r'\s+id\s*=\s*["\']player_(\d{6,})',
-                 r'\bdata-id\s*=\s*["\'](\d{6,})'), webpage, 'video id')
+
+            def extract_id(pattern, name, default=NO_DEFAULT):
+                return self._html_search_regex(
+                    (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](%s)' % pattern,
+                     r'\s+id\s*=\s*["\']player_(%s)' % pattern,
+                     r'\bdata-id\s*=\s*["\'](%s)' % pattern), webpage, name,
+                    default=default)
+
+            nexx_id = extract_id(NEXX_ID_RE, 'nexx id', default=None)
+            if nexx_id:
+                return nexx_result(nexx_id)
+
+            if not jwplatform_id:
+                jwplatform_id = extract_id(JWPLATFORM_ID_RE, 'jwplatform id')
+
+            media = self._download_json(
+                'https://cdn.jwplayer.com/v2/media/' + jwplatform_id,
+                display_id)
+            nexx_id = try_get(
+                media, lambda x: x['playlist'][0]['nexx_id'], compat_str)
+
+            if nexx_id:
+                return nexx_result(nexx_id)

        return self.url_result(
-            'https://api.nexx.cloud/v3/759/videos/byid/%s' % video_id,
-            ie=NexxIE.ie_key(), video_id=video_id)
+            'jwplatform:%s' % jwplatform_id, ie=JWPlatformIE.ie_key(),
+            video_id=jwplatform_id)
@@ -31,6 +31,10 @@ class ThisOldHouseIE(InfoExtractor):
    }, {
        'url': 'https://www.thisoldhouse.com/21113884/s41-e13-paradise-lost',
        'only_matching': True,
+    }, {
+        # iframe www.thisoldhouse.com
+        'url': 'https://www.thisoldhouse.com/21083431/seaside-transformation-the-westerly-project',
+        'only_matching': True,
    }]
    _ZYPE_TMPL = 'https://player.zype.com/embed/%s.html?api_key=hsOk_yMSPYNrT22e9pu8hihLXjaZf0JW5jsOWv4ZqyHJFvkJn6rtToHl09tbbsbe'

@@ -38,6 +42,6 @@ class ThisOldHouseIE(InfoExtractor):
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)
        video_id = self._search_regex(
-            r'<iframe[^>]+src=[\'"](?:https?:)?//thisoldhouse\.chorus\.build/videos/zype/([0-9a-f]{24})',
+            r'<iframe[^>]+src=[\'"](?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})',
            webpage, 'video id')
        return self.url_result(self._ZYPE_TMPL % video_id, 'Zype', video_id)
@@ -99,7 +99,7 @@ class TV4IE(InfoExtractor):
            manifest_url.replace('.m3u8', '.f4m'),
            video_id, f4m_id='hds', fatal=False))
        formats.extend(self._extract_ism_formats(
-            re.sub(r'\.ism/.+?\.m3u8', r'.ism/Manifest', manifest_url),
+            re.sub(r'\.ism/.*?\.m3u8', r'.ism/Manifest', manifest_url),
            video_id, ism_id='mss', fatal=False))

        if not formats and info.get('is_geo_restricted'):
@@ -655,7 +655,14 @@ class TwitchStreamIE(TwitchBaseIE):

 class TwitchClipsIE(TwitchBaseIE):
    IE_NAME = 'twitch:clips'
-    _VALID_URL = r'https?://(?:clips\.twitch\.tv/(?:embed\?.*?\bclip=|(?:[^/]+/)*)|(?:www\.)?twitch\.tv/[^/]+/clip/)(?P<id>[^/?#&]+)'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            clips\.twitch\.tv/(?:embed\?.*?\bclip=|(?:[^/]+/)*)|
+                            (?:(?:www|go|m)\.)?twitch\.tv/[^/]+/clip/
+                        )
+                        (?P<id>[^/?#&]+)
+                    '''

    _TESTS = [{
        'url': 'https://clips.twitch.tv/FaintLightGullWholeWheat',
@@ -681,6 +688,12 @@ class TwitchClipsIE(TwitchBaseIE):
    }, {
        'url': 'https://clips.twitch.tv/embed?clip=InquisitiveBreakableYogurtJebaited',
        'only_matching': True,
+    }, {
+        'url': 'https://m.twitch.tv/rossbroadcast/clip/ConfidentBraveHumanChefFrank',
+        'only_matching': True,
+    }, {
+        'url': 'https://go.twitch.tv/rossbroadcast/clip/ConfidentBraveHumanChefFrank',
+        'only_matching': True,
    }]

    def _real_extract(self, url):
@@ -1845,15 +1845,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                        # fields may contain comma as well (see
                        # https://github.com/ytdl-org/youtube-dl/issues/8536)
                        feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
+
+                        def feed_entry(name):
+                            return try_get(feed_data, lambda x: x[name][0], compat_str)
+
+                        feed_id = feed_entry('id')
+                        if not feed_id:
+                            continue
+                        feed_title = feed_entry('title')
+                        title = video_title
+                        if feed_title:
+                            title += ' (%s)' % feed_title
                        entries.append({
                            '_type': 'url_transparent',
                            'ie_key': 'Youtube',
                            'url': smuggle_url(
                                '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
                                {'force_singlefeed': True}),
-                            'title': '%s (%s)' % (video_title, feed_data['title'][0]),
+                            'title': title,
                        })
-                        feed_ids.append(feed_data['id'][0])
+                        feed_ids.append(feed_id)
                    self.to_screen(
                        'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
                        % (', '.join(feed_ids), video_id))
@@ -1924,7 +1935,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                }

            for fmt in streaming_formats:
-                if fmt.get('drm_families'):
+                if fmt.get('drmFamilies') or fmt.get('drm_families'):
                    continue
                url = url_or_none(fmt.get('url'))