Merge branch 'master' into fix.25.12.2018

2025-03-10 23:37:18 +08:00 · 2019-01-22 12:46:24 +02:00 · 2019-01-22 12:46:24 +02:00 · c277785011
commit c277785011
parent 61ee81c06b 19d6991312
18 changed files with 612 additions and 90 deletions
--- a/README.md
+++ b/README.md
@ -667,7 +667,7 @@ The following numeric meta fields can be used with comparisons `<`, `<=`, `>`, `
 - `asr`: Audio sampling rate in Hertz
 - `fps`: Frame rate

-Also filtering work for comparisons `=` (equals), `!=` (not equals), `^=` (begins with), `$=` (ends with), `*=` (contains) and following string meta fields:
+Filtering also works for the comparisons `=` (equals), `^=` (starts with), `$=` (ends with), `*=` (contains) on the following string meta fields:
 - `ext`: File extension
 - `acodec`: Name of the audio codec in use
 - `vcodec`: Name of the video codec in use
@ -675,6 +675,8 @@ Also filtering work for comparisons `=` (equals), `!=` (not equals), `^=` (begin
 - `protocol`: The protocol that will be used for the actual download, lower-case (`http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `mms`, `f4m`, `ism`, `http_dash_segments`, `m3u8`, or `m3u8_native`)
 - `format_id`: A short description of the format

+Any string comparison may be prefixed with negation `!` in order to produce an opposite comparison, e.g. `!*=` (does not contain).
+
 Note that none of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by particular extractor, i.e. the metadata offered by the video hoster.

 Formats for which the value is not known are excluded unless you put a question mark (`?`) after the operator. You can combine format filters, so `-f "[height <=? 720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s.
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@ -497,7 +497,64 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
                    'width': 1280,
                    'height': 720,
                }]
-            )
+            ),
+            (
+                # https://github.com/rg3/youtube-dl/issues/18923
+                # https://www.ted.com/talks/boris_hesser_a_grassroots_healthcare_revolution_in_africa
+                'ted_18923',
+                'http://hls.ted.com/talks/31241.m3u8',
+                [{
+                    'url': 'http://hls.ted.com/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b',
+                    'format_id': '600k-Audio',
+                    'vcodec': 'none',
+                }, {
+                    'url': 'http://hls.ted.com/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b',
+                    'format_id': '68',
+                    'vcodec': 'none',
+                }, {
+                    'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/64k.m3u8?nobumpers=true&uniqueId=76011e2b',
+                    'format_id': '163',
+                    'acodec': 'none',
+                    'width': 320,
+                    'height': 180,
+                }, {
+                    'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/180k.m3u8?nobumpers=true&uniqueId=76011e2b',
+                    'format_id': '481',
+                    'acodec': 'none',
+                    'width': 512,
+                    'height': 288,
+                }, {
+                    'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/320k.m3u8?nobumpers=true&uniqueId=76011e2b',
+                    'format_id': '769',
+                    'acodec': 'none',
+                    'width': 512,
+                    'height': 288,
+                }, {
+                    'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/450k.m3u8?nobumpers=true&uniqueId=76011e2b',
+                    'format_id': '984',
+                    'acodec': 'none',
+                    'width': 512,
+                    'height': 288,
+                }, {
+                    'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/600k.m3u8?nobumpers=true&uniqueId=76011e2b',
+                    'format_id': '1255',
+                    'acodec': 'none',
+                    'width': 640,
+                    'height': 360,
+                }, {
+                    'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/950k.m3u8?nobumpers=true&uniqueId=76011e2b',
+                    'format_id': '1693',
+                    'acodec': 'none',
+                    'width': 853,
+                    'height': 480,
+                }, {
+                    'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/1500k.m3u8?nobumpers=true&uniqueId=76011e2b',
+                    'format_id': '2462',
+                    'acodec': 'none',
+                    'width': 1280,
+                    'height': 720,
+                }]
+            ),
        ]

        for m3u8_file, m3u8_url, expected_formats in _TEST_CASES:
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@ -239,6 +239,52 @@ class TestFormatSelection(unittest.TestCase):
        downloaded = ydl.downloaded_info_dicts[0]
        self.assertEqual(downloaded['format_id'], 'vid-vcodec-dot')

+    def test_format_selection_string_ops(self):
+        formats = [
+            {'format_id': 'abc-cba', 'ext': 'mp4', 'url': TEST_URL},
+        ]
+        info_dict = _make_result(formats)
+
+        # equals (=): an exact format_id match selects the single format
+        ydl = YDL({'format': '[format_id=abc-cba]'})
+        ydl.process_ie_result(info_dict.copy())
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(downloaded['format_id'], 'abc-cba')
+
+        # does not equal (!=): the only format is rejected, so processing raises
+        ydl = YDL({'format': '[format_id!=abc-cba]'})
+        self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy())
+
+        # starts with (^=): the prefix 'abc' matches 'abc-cba'
+        ydl = YDL({'format': '[format_id^=abc]'})
+        ydl.process_ie_result(info_dict.copy())
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(downloaded['format_id'], 'abc-cba')
+
+        # does not start with (!^=): the full id is its own prefix, so it is rejected
+        ydl = YDL({'format': '[format_id!^=abc-cba]'})
+        self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy())
+
+        # ends with ($=): the suffix 'cba' matches 'abc-cba'
+        ydl = YDL({'format': '[format_id$=cba]'})
+        ydl.process_ie_result(info_dict.copy())
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(downloaded['format_id'], 'abc-cba')
+
+        # does not end with (!$=): the full id is its own suffix, so it is rejected
+        ydl = YDL({'format': '[format_id!$=abc-cba]'})
+        self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy())
+
+        # contains (*=): the id contains '-'
+        ydl = YDL({'format': '[format_id*=-]'})
+        ydl.process_ie_result(info_dict.copy())
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(downloaded['format_id'], 'abc-cba')
+
+        # does not contain (!*=) -- NOTE(review): every negated case here only asserts rejection; none checks a negated filter that accepts a format
+        ydl = YDL({'format': '[format_id!*=-]'})
+        self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy())
+
    def test_youtube_format_selection(self):
        order = [
            '38', '37', '46', '22', '45', '35', '44', '18', '34', '43', '6', '5', '17', '36', '13',
--- a/test/test_utils.py
+++ b/test/test_utils.py
@ -507,6 +507,8 @@ class TestUtil(unittest.TestCase):
        self.assertEqual(urljoin('http://foo.de/', ''), None)
        self.assertEqual(urljoin('http://foo.de/', ['foobar']), None)
        self.assertEqual(urljoin('http://foo.de/a/b/c.txt', '.././../d.txt'), 'http://foo.de/d.txt')
+        self.assertEqual(urljoin('http://foo.de/a/b/c.txt', 'rtmp://foo.de'), 'rtmp://foo.de')
+        self.assertEqual(urljoin(None, 'rtmp://foo.de'), 'rtmp://foo.de')

    def test_url_or_none(self):
        self.assertEqual(url_or_none(None), None)
--- a/test/testdata/m3u8/ted_18923.m3u8
+++ b/test/testdata/m3u8/ted_18923.m3u8
@ -0,0 +1,28 @@
+#EXTM3U
+#EXT-X-VERSION:4
+#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=1255659,PROGRAM-ID=1,CODECS="avc1.42c01e,mp4a.40.2",RESOLUTION=640x360
+/videos/BorisHesser_2018S/video/600k.m3u8?nobumpers=true&uniqueId=76011e2b
+#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=163154,PROGRAM-ID=1,CODECS="avc1.42c00c,mp4a.40.2",RESOLUTION=320x180
+/videos/BorisHesser_2018S/video/64k.m3u8?nobumpers=true&uniqueId=76011e2b
+#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=481701,PROGRAM-ID=1,CODECS="avc1.42c015,mp4a.40.2",RESOLUTION=512x288
+/videos/BorisHesser_2018S/video/180k.m3u8?nobumpers=true&uniqueId=76011e2b
+#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=769968,PROGRAM-ID=1,CODECS="avc1.42c015,mp4a.40.2",RESOLUTION=512x288
+/videos/BorisHesser_2018S/video/320k.m3u8?nobumpers=true&uniqueId=76011e2b
+#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=984037,PROGRAM-ID=1,CODECS="avc1.42c015,mp4a.40.2",RESOLUTION=512x288
+/videos/BorisHesser_2018S/video/450k.m3u8?nobumpers=true&uniqueId=76011e2b
+#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=1693925,PROGRAM-ID=1,CODECS="avc1.4d401f,mp4a.40.2",RESOLUTION=853x480
+/videos/BorisHesser_2018S/video/950k.m3u8?nobumpers=true&uniqueId=76011e2b
+#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=2462469,PROGRAM-ID=1,CODECS="avc1.640028,mp4a.40.2",RESOLUTION=1280x720
+/videos/BorisHesser_2018S/video/1500k.m3u8?nobumpers=true&uniqueId=76011e2b
+#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=68101,PROGRAM-ID=1,CODECS="mp4a.40.2",DEFAULT=YES
+/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b
+
+#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=74298,PROGRAM-ID=1,CODECS="avc1.42c00c",RESOLUTION=320x180,URI="/videos/BorisHesser_2018S/video/64k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b"
+#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=216200,PROGRAM-ID=1,CODECS="avc1.42c015",RESOLUTION=512x288,URI="/videos/BorisHesser_2018S/video/180k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b"
+#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=304717,PROGRAM-ID=1,CODECS="avc1.42c015",RESOLUTION=512x288,URI="/videos/BorisHesser_2018S/video/320k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b"
+#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=350933,PROGRAM-ID=1,CODECS="avc1.42c015",RESOLUTION=512x288,URI="/videos/BorisHesser_2018S/video/450k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b"
+#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=495850,PROGRAM-ID=1,CODECS="avc1.42c01e",RESOLUTION=640x360,URI="/videos/BorisHesser_2018S/video/600k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b"
+#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=810750,PROGRAM-ID=1,CODECS="avc1.4d401f",RESOLUTION=853x480,URI="/videos/BorisHesser_2018S/video/950k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b"
+#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=1273700,PROGRAM-ID=1,CODECS="avc1.640028",RESOLUTION=1280x720,URI="/videos/BorisHesser_2018S/video/1500k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b"
+
+#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="600k",LANGUAGE="en",NAME="Audio",AUTOSELECT=YES,DEFAULT=YES,URI="/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b",BANDWIDTH=614400
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@ -1063,21 +1063,24 @@ class YoutubeDL(object):
        if not m:
            STR_OPERATORS = {
                '=': operator.eq,
-                '!=': operator.ne,
                '^=': lambda attr, value: attr.startswith(value),
                '$=': lambda attr, value: attr.endswith(value),
                '*=': lambda attr, value: value in attr,
            }
            str_operator_rex = re.compile(r'''(?x)
                \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
-                \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
+                \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?
                \s*(?P<value>[a-zA-Z0-9._-]+)
                \s*$
                ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
            m = str_operator_rex.search(filter_spec)
            if m:
                comparison_value = m.group('value')
-                op = STR_OPERATORS[m.group('op')]
+                str_op = STR_OPERATORS[m.group('op')]
+                if m.group('negation'):
+                    op = lambda attr, value: not str_op(attr, value)
+                else:
+                    op = str_op

        if not m:
            raise ValueError('Invalid filter specification %r' % filter_spec)
--- a/youtube_dl/extractor/cartoonnetwork.py
+++ b/youtube_dl/extractor/cartoonnetwork.py
@ -1,20 +1,19 @@
 # coding: utf-8
 from __future__ import unicode_literals

-import re
-
 from .turner import TurnerBaseIE
+from ..utils import int_or_none


 class CartoonNetworkIE(TurnerBaseIE):
    _VALID_URL = r'https?://(?:www\.)?cartoonnetwork\.com/video/(?:[^/]+/)+(?P<id>[^/?#]+)-(?:clip|episode)\.html'
    _TEST = {
-        'url': 'http://www.cartoonnetwork.com/video/teen-titans-go/starfire-the-cat-lady-clip.html',
+        'url': 'https://www.cartoonnetwork.com/video/ben-10/how-to-draw-upgrade-episode.html',
        'info_dict': {
-            'id': '8a250ab04ed07e6c014ef3f1e2f9016c',
+            'id': '6e3375097f63874ebccec7ef677c1c3845fa850e',
            'ext': 'mp4',
-            'title': 'Starfire the Cat Lady',
-            'description': 'Robin decides to become a cat so that Starfire will finally love him.',
+            'title': 'How to Draw Upgrade',
+            'description': 'md5:2061d83776db7e8be4879684eefe8c0f',
        },
        'params': {
            # m3u8 download
@ -25,18 +24,39 @@ class CartoonNetworkIE(TurnerBaseIE):
    def _real_extract(self, url):
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)
-        id_type, video_id = re.search(r"_cnglobal\.cvp(Video|Title)Id\s*=\s*'([^']+)';", webpage).groups()
-        query = ('id' if id_type == 'Video' else 'titleId') + '=' + video_id
-        return self._extract_cvp_info(
-            'http://www.cartoonnetwork.com/video-seo-svc/episodeservices/getCvpPlaylist?networkName=CN2&' + query, video_id, {
-                'secure': {
-                    'media_src': 'http://androidhls-secure.cdn.turner.com/toon/big',
-                    'tokenizer_src': 'https://token.vgtf.net/token/token_mobile',
-                },
-            }, {
+
+        def find_field(global_re, name, content_re=None, value_re='[^"]+', fatal=False):
+            metadata_re = ''
+            if content_re:
+                metadata_re = r'|video_metadata\.content_' + content_re
+            return self._search_regex(
+                r'(?:_cnglobal\.currentVideo\.%s%s)\s*=\s*"(%s)";' % (global_re, metadata_re, value_re),
+                webpage, name, fatal=fatal)
+
+        media_id = find_field('mediaId', 'media id', 'id', '[0-9a-f]{40}', True)
+        title = find_field('episodeTitle', 'title', '(?:episodeName|name)', fatal=True)
+
+        info = self._extract_ngtv_info(
+            media_id, {'networkId': 'cartoonnetwork'}, {
                'url': url,
                'site_name': 'CartoonNetwork',
-                'auth_required': self._search_regex(
-                    r'_cnglobal\.cvpFullOrPreviewAuth\s*=\s*(true|false);',
-                    webpage, 'auth required', default='false') == 'true',
+                'auth_required': find_field('authType', 'auth type') != 'unauth',
            })
+
+        series = find_field(
+            'propertyName', 'series', 'showName') or self._html_search_meta('partOfSeries', webpage)
+        info.update({
+            'id': media_id,
+            'display_id': display_id,
+            'title': title,
+            'description': self._html_search_meta('description', webpage),
+            'series': series,
+            'episode': title,
+        })
+
+        for field in ('season', 'episode'):
+            field_name = field + 'Number'
+            info[field + '_number'] = int_or_none(find_field(
+                field_name, field + ' number', value_re=r'\d+') or self._html_search_meta(field_name, webpage))
+
+        return info
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@ -1596,6 +1596,7 @@ class InfoExtractor(object):
        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211
+        # 3. https://github.com/rg3/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
@ -1667,11 +1668,16 @@ class InfoExtractor(object):
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

+        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
+        # chance to detect video only formats when EXT-X-STREAM-INF tags
+        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
+        for line in m3u8_doc.splitlines():
+            if line.startswith('#EXT-X-MEDIA:'):
+                extract_media(line)
+
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
-            elif line.startswith('#EXT-X-MEDIA:'):
-                extract_media(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
@ -2624,7 +2630,7 @@ class InfoExtractor(object):
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': video_data.get('description'),
-                'thumbnail': self._proto_relative_url(video_data.get('image')),
+                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
@ -2651,12 +2657,9 @@ class InfoExtractor(object):
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
-            source_url = self._proto_relative_url(source.get('file'))
-            if not source_url:
-                continue
-            if base_url:
-                source_url = compat_urlparse.urljoin(base_url, source_url)
-            if source_url in urls:
+            source_url = urljoin(
+                base_url, self._proto_relative_url(source.get('file')))
+            if not source_url or source_url in urls:
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -452,6 +452,7 @@ from .hellporno import HellPornoIE
 from .helsinki import HelsinkiIE
 from .hentaistigma import HentaiStigmaIE
 from .hgtv import HGTVComShowIE
+from .hketv import HKETVIE
 from .hidive import HiDiveIE
 from .historicfilms import HistoricFilmsIE
 from .hitbox import HitboxIE, HitboxLiveIE
@ -494,7 +495,11 @@ from .ina import InaIE
 from .inc import IncIE
 from .indavideo import IndavideoEmbedIE
 from .infoq import InfoQIE
-from .instagram import InstagramIE, InstagramUserIE
+from .instagram import (
+    InstagramIE,
+    InstagramUserIE,
+    InstagramTagIE,
+)
 from .internazionale import InternazionaleIE
 from .internetvideoarchive import InternetVideoArchiveIE
 from .iprima import IPrimaIE
--- a/youtube_dl/extractor/hketv.py
+++ b/youtube_dl/extractor/hketv.py
@ -0,0 +1,191 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    clean_html,
+    ExtractorError,
+    int_or_none,
+    merge_dicts,
+    parse_count,
+    str_or_none,
+    try_get,
+    unified_strdate,
+    urlencode_postdata,
+    urljoin,
+)
+
+
+class HKETVIE(InfoExtractor):
+    IE_NAME = 'hketv'
+    IE_DESC = '香港教育局教育電視 (HKETV) Educational Television, Hong Kong Educational Bureau'
+    _GEO_BYPASS = False
+    _GEO_COUNTRIES = ['HK']
+    _VALID_URL = r'https?://(?:www\.)?hkedcity\.net/etv/resource/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'https://www.hkedcity.net/etv/resource/2932360618',
+        'md5': 'f193712f5f7abb208ddef3c5ea6ed0b7',
+        'info_dict': {
+            'id': '2932360618',
+            'ext': 'mp4',
+            'title': '喜閱一生(共享閱讀樂) (中、英文字幕可供選擇)',
+            'description': 'md5:d5286d05219ef50e0613311cbe96e560',
+            'upload_date': '20181024',
+            'duration': 900,
+            'subtitles': 'count:2',
+        },
+        'skip': 'Geo restricted to HK',
+    }, {
+        'url': 'https://www.hkedcity.net/etv/resource/972641418',
+        'md5': '1ed494c1c6cf7866a8290edad9b07dc9',
+        'info_dict': {
+            'id': '972641418',
+            'ext': 'mp4',
+            'title': '衣冠楚楚 (天使系列之一)',
+            'description': 'md5:10bb3d659421e74f58e5db5691627b0f',
+            'upload_date': '20070109',
+            'duration': 907,
+            'subtitles': {},
+        },
+        'params': {
+            'geo_verification_proxy': '<HK proxy here>',
+        },
+        'skip': 'Geo restricted to HK',
+    }]
+
+    _CC_LANGS = {
+        '中文（繁體中文）': 'zh-Hant',
+        '中文（简体中文）': 'zh-Hans',
+        'English': 'en',
+        'Bahasa Indonesia': 'id',
+        '\u0939\u093f\u0928\u094d\u0926\u0940': 'hi',
+        '\u0928\u0947\u092a\u093e\u0932\u0940': 'ne',
+        'Tagalog': 'tl',
+        '\u0e44\u0e17\u0e22': 'th',
+        '\u0627\u0631\u062f\u0648': 'ur',
+    }
+    _FORMAT_HEIGHTS = {
+        'SD': 360,
+        'HD': 720,
+    }
+    _APPS_BASE_URL = 'https://apps.hkedcity.net'
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = (
+            self._html_search_meta(
+                ('ed_title', 'search.ed_title'), webpage, default=None) or
+            self._search_regex(
+                r'data-favorite_title_(?:eng|chi)=(["\'])(?P<id>(?:(?!\1).)+)\1',
+                webpage, 'title', default=None, group='id') or
+            self._html_search_regex(
+                r'<h1>([^<]+)</h1>', webpage, 'title', default=None) or
+            self._og_search_title(webpage)
+        )
+
+        file_id = self._search_regex(
+            r'post_var\[["\']file_id["\']\s*\]\s*=\s*(.+?);',
+            webpage, 'file ID')
+        curr_url = self._search_regex(
+            r'post_var\[["\']curr_url["\']\s*\]\s*=\s*"(.+?)";',
+            webpage, 'curr URL')
+        data = {
+            'action': 'get_info',
+            'curr_url': curr_url,
+            'file_id': file_id,
+            'video_url': file_id,
+        }
+
+        response = self._download_json(
+            self._APPS_BASE_URL + '/media/play/handler.php', video_id,
+            data=urlencode_postdata(data),
+            headers=merge_dicts({
+                'Content-Type': 'application/x-www-form-urlencoded'},
+                self.geo_verification_headers()))
+
+        # Check access before reading 'result' so failures report the server message
+        if not response.get('success') or not response.get('access'):
+            error = clean_html(response.get('access_err_msg'))
+            if error and 'Video streaming is not available in your country' in error:
+                self.raise_geo_restricted(
+                    msg=error, countries=self._GEO_COUNTRIES)
+            else:
+                raise ExtractorError(error or 'Access denied', expected=True)
+
+        result = response['result']
+        formats = []
+
+        width = int_or_none(result.get('width'))
+        height = int_or_none(result.get('height'))
+
+        playlist0 = result['playlist'][0]
+        for fmt in playlist0['sources']:
+            file_url = urljoin(self._APPS_BASE_URL, fmt.get('file'))
+            if not file_url:
+                continue
+            # If we ever wanted to provide the final resolved URL that
+            # does not require cookies, albeit with a shorter lifespan:
+            #     urlh = self._downloader.urlopen(file_url)
+            #     resolved_url = urlh.geturl()
+            label = fmt.get('label')
+            h = self._FORMAT_HEIGHTS.get(label)
+            w = h * width // height if h and width and height else None
+            formats.append({
+                'format_id': label,
+                'ext': fmt.get('type'),
+                'url': file_url,
+                'width': w,
+                'height': h,
+            })
+        self._sort_formats(formats)
+
+        subtitles = {}
+        tracks = try_get(playlist0, lambda x: x['tracks'], list) or []
+        for track in tracks:
+            if not isinstance(track, dict):
+                continue
+            track_kind = str_or_none(track.get('kind'))
+            if not track_kind or not isinstance(track_kind, compat_str):
+                continue
+            if track_kind.lower() not in ('captions', 'subtitles'):
+                continue
+            track_url = urljoin(self._APPS_BASE_URL, track.get('file'))
+            if not track_url:
+                continue
+            track_label = track.get('label')
+            subtitles.setdefault(self._CC_LANGS.get(
+                track_label, track_label), []).append({
+                    'url': self._proto_relative_url(track_url),
+                    'ext': 'srt',
+                })
+
+        # Likes
+        emotion = self._download_json(
+            'https://emocounter.hkedcity.net/handler.php', video_id,
+            data=urlencode_postdata({
+                'action': 'get_emotion',
+                'data[bucket_id]': 'etv',
+                'data[identifier]': video_id,
+            }),
+            headers={'Content-Type': 'application/x-www-form-urlencoded'},
+            fatal=False) or {}
+        like_count = int_or_none(try_get(
+            emotion, lambda x: x['data']['emotion_data'][0]['count']))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': self._html_search_meta(
+                'description', webpage, fatal=False),
+            'upload_date': unified_strdate(self._html_search_meta(
+                'ed_date', webpage, fatal=False), day_first=False),
+            'duration': int_or_none(result.get('length')),
+            'formats': formats,
+            'subtitles': subtitles,
+            'thumbnail': urljoin(self._APPS_BASE_URL, result.get('image')),
+            'view_count': parse_count(result.get('view_count')),
+            'like_count': like_count,
+        }
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@ -227,44 +227,37 @@ class InstagramIE(InfoExtractor):
        }


-class InstagramUserIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])'
-    IE_DESC = 'Instagram user profile'
-    IE_NAME = 'instagram:user'
-    _TEST = {
-        'url': 'https://instagram.com/porsche',
-        'info_dict': {
-            'id': 'porsche',
-            'title': 'porsche',
-        },
-        'playlist_count': 5,
-        'params': {
-            'extract_flat': True,
-            'skip_download': True,
-            'playlistend': 5,
-        }
-    }
+class InstagramPlaylistIE(InfoExtractor):
+    # A superclass for handling any kind of query based on GraphQL which
+    # results in a playlist.

-    _gis_tmpl = None
+    _gis_tmpl = None  # used to cache GIS request type

-    def _entries(self, data):
+    def _parse_graphql(self, webpage, item_id):
+        # Reads a webpage and returns its GraphQL data.
+        return self._parse_json(
+            self._search_regex(
+                r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'),
+            item_id)
+
+    def _extract_graphql(self, data, url):
+        # Parses GraphQL queries containing videos and generates a playlist.
        def get_count(suffix):
            return int_or_none(try_get(
                node, lambda x: x['edge_media_' + suffix]['count']))

-        uploader_id = data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
+        uploader_id = self._match_id(url)
        csrf_token = data['config']['csrf_token']
        rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8'

-        self._set_cookie('instagram.com', 'ig_pr', '1')
-
        cursor = ''
        for page_num in itertools.count(1):
-            variables = json.dumps({
-                'id': uploader_id,
+            variables = {
                'first': 12,
                'after': cursor,
-            })
+            }
+            variables.update(self._query_vars_for(data))
+            variables = json.dumps(variables)

            if self._gis_tmpl:
                gis_tmpls = [self._gis_tmpl]
@ -276,21 +269,26 @@ class InstagramUserIE(InfoExtractor):
                    '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']),
                ]

+            # try all of the ways to generate a GIS query, and not only use the
+            # first one that works, but cache it for future requests
            for gis_tmpl in gis_tmpls:
                try:
-                    media = self._download_json(
+                    json_data = self._download_json(
                        'https://www.instagram.com/graphql/query/', uploader_id,
                        'Downloading JSON page %d' % page_num, headers={
                            'X-Requested-With': 'XMLHttpRequest',
                            'X-Instagram-GIS': hashlib.md5(
                                ('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest(),
                        }, query={
-                            'query_hash': '42323d64886122307be10013ad2dcc44',
+                            'query_hash': self._QUERY_HASH,
                            'variables': variables,
-                        })['data']['user']['edge_owner_to_timeline_media']
+                        })
+                    media = self._parse_timeline_from(json_data)
                    self._gis_tmpl = gis_tmpl
                    break
                except ExtractorError as e:
+                    # if it's an error caused by a bad query, and there are
+                    # more GIS templates to try, ignore it and keep trying
                    if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
                        if gis_tmpl != gis_tmpls[-1]:
                            continue
@ -348,14 +346,80 @@ class InstagramUserIE(InfoExtractor):
                break

    def _real_extract(self, url):
-        username = self._match_id(url)
+        user_or_tag = self._match_id(url)
+        webpage = self._download_webpage(url, user_or_tag)
+        data = self._parse_graphql(webpage, user_or_tag)

-        webpage = self._download_webpage(url, username)
-
-        data = self._parse_json(
-            self._search_regex(
-                r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'),
-            username)
+        self._set_cookie('instagram.com', 'ig_pr', '1')

        return self.playlist_result(
-            self._entries(data), username, username)
+            self._extract_graphql(data, url), user_or_tag, user_or_tag)
+
+
+class InstagramUserIE(InstagramPlaylistIE):
+    _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])'
+    IE_DESC = 'Instagram user profile'
+    IE_NAME = 'instagram:user'
+    _TEST = {
+        'url': 'https://instagram.com/porsche',
+        'info_dict': {
+            'id': 'porsche',
+            'title': 'porsche',
+        },
+        'playlist_count': 5,
+        'params': {
+            'extract_flat': True,
+            'skip_download': True,
+            'playlistend': 5,
+        }
+    }
+
+    _QUERY_HASH = '42323d64886122307be10013ad2dcc44'
+
+    @staticmethod
+    def _parse_timeline_from(data):
+        # extracts the media timeline data from a GraphQL result
+        return data['data']['user']['edge_owner_to_timeline_media']
+
+    @staticmethod
+    def _query_vars_for(data):
+        # returns a dictionary of variables to add to the timeline query based
+        # on the GraphQL of the original page
+        return {
+            'id': data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
+        }
+
+
+class InstagramTagIE(InstagramPlaylistIE):
+    _VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P<id>[^/]+)'
+    IE_DESC = 'Instagram hashtag search'
+    IE_NAME = 'instagram:tag'
+    _TEST = {
+        'url': 'https://instagram.com/explore/tags/lolcats',
+        'info_dict': {
+            'id': 'lolcats',
+            'title': 'lolcats',
+        },
+        'playlist_count': 50,
+        'params': {
+            'extract_flat': True,
+            'skip_download': True,
+            'playlistend': 50,
+        }
+    }
+
+    _QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314'
+
+    @staticmethod
+    def _parse_timeline_from(data):
+        # extracts the media timeline data from a GraphQL result
+        return data['data']['hashtag']['edge_hashtag_to_media']
+
+    @staticmethod
+    def _query_vars_for(data):
+        # returns a dictionary of variables to add to the timeline query based
+        # on the GraphQL of the original page
+        return {
+            'tag_name':
+                data['entry_data']['TagPage'][0]['graphql']['hashtag']['name']
+        }
--- a/youtube_dl/extractor/radiocanada.py
+++ b/youtube_dl/extractor/radiocanada.py
@ -49,6 +49,16 @@ class RadioCanadaIE(InfoExtractor):
                # m3u8 download
                'skip_download': True,
            },
+        },
+        {
+            # with protectionType but not actually DRM protected
+            'url': 'radiocanada:toutv:140872',
+            'info_dict': {
+                'id': '140872',
+                'title': 'Épisode 1',
+                'series': 'District 31',
+            },
+            'only_matching': True,
        }
    ]

@ -67,8 +77,10 @@ class RadioCanadaIE(InfoExtractor):
            el = find_xpath_attr(metadata, './/Meta', 'name', name)
            return el.text if el is not None else None

+        # protectionType does not necessarily mean the video is DRM protected (see
+        # https://github.com/rg3/youtube-dl/pull/18609).
        if get_meta('protectionType'):
-            raise ExtractorError('This video is DRM protected.', expected=True)
+            self.report_warning('This video is probably DRM protected.')

        device_types = ['ipad']
        if not smuggled_data:
--- a/youtube_dl/extractor/streamango.py
+++ b/youtube_dl/extractor/streamango.py
@ -14,7 +14,7 @@ from ..utils import (


 class StreamangoIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?streamango\.com/(?:f|embed)/(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://(?:www\.)?(?:streamango\.com|fruithosts\.net)/(?:f|embed)/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://streamango.com/f/clapasobsptpkdfe/20170315_150006_mp4',
        'md5': 'e992787515a182f55e38fc97588d802a',
@ -38,6 +38,9 @@ class StreamangoIE(InfoExtractor):
    }, {
        'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4',
        'only_matching': True,
+    }, {
+        'url': 'https://fruithosts.net/f/mreodparcdcmspsm/w1f1_r4lph_2018_brrs_720p_latino_mp4',
+        'only_matching': True,
    }]

    def _real_extract(self, url):
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@ -265,6 +265,8 @@ class TEDIE(InfoExtractor):
                    'format_id': m3u8_format['format_id'].replace('hls', 'http'),
                    'protocol': 'http',
                })
+                if f.get('acodec') == 'none':
+                    del f['acodec']
                formats.append(f)

        audio_download = talk_info.get('audioDownload')
--- a/youtube_dl/extractor/tnaflix.py
+++ b/youtube_dl/extractor/tnaflix.py
@ -96,7 +96,7 @@ class TNAFlixNetworkBaseIE(InfoExtractor):

        cfg_xml = self._download_xml(
            cfg_url, display_id, 'Downloading metadata',
-            transform_source=fix_xml_ampersands)
+            transform_source=fix_xml_ampersands, headers={'Referer': url})

        formats = []

--- a/youtube_dl/extractor/videomore.py
+++ b/youtube_dl/extractor/videomore.py
@ -4,8 +4,14 @@ from __future__ import unicode_literals
 import re

 from .common import InfoExtractor
+from ..compat import compat_str
 from ..utils import (
    int_or_none,
+    orderedSet,
+    parse_duration,
+    str_or_none,
+    unified_strdate,
+    url_or_none,
    xpath_element,
    xpath_text,
 )
@ -13,7 +19,19 @@ from ..utils import (

 class VideomoreIE(InfoExtractor):
    IE_NAME = 'videomore'
-    _VALID_URL = r'videomore:(?P<sid>\d+)$|https?://videomore\.ru/(?:(?:embed|[^/]+/[^/]+)/|[^/]+\?.*\btrack_id=)(?P<id>\d+)(?:[/?#&]|\.(?:xml|json)|$)'
+    _VALID_URL = r'''(?x)
+                    videomore:(?P<sid>\d+)$|
+                    https?://(?:player\.)?videomore\.ru/
+                        (?:
+                            (?:
+                                embed|
+                                [^/]+/[^/]+
+                            )/|
+                            [^/]*\?.*?\btrack_id=
+                        )
+                        (?P<id>\d+)
+                        (?:[/?#&]|\.(?:xml|json)|$)
+                    '''
    _TESTS = [{
        'url': 'http://videomore.ru/kino_v_detalayah/5_sezon/367617',
        'md5': '44455a346edc0d509ac5b5a5b531dc35',
@ -79,6 +97,9 @@ class VideomoreIE(InfoExtractor):
    }, {
        'url': 'videomore:367617',
        'only_matching': True,
+    }, {
+        'url': 'https://player.videomore.ru/?partner_id=97&track_id=736234&autoplay=0&userToken=',
+        'only_matching': True,
    }]

    @staticmethod
@ -136,7 +157,7 @@ class VideomoreIE(InfoExtractor):

 class VideomoreVideoIE(InfoExtractor):
    IE_NAME = 'videomore:video'
-    _VALID_URL = r'https?://videomore\.ru/(?:(?:[^/]+/){2})?(?P<id>[^/?#&]+)[/?#&]*$'
+    _VALID_URL = r'https?://videomore\.ru/(?:(?:[^/]+/){2})?(?P<id>[^/?#&]+)(?:/*|[?#&].*?)$'
    _TESTS = [{
        # single video with og:video:iframe
        'url': 'http://videomore.ru/elki_3',
@ -176,6 +197,9 @@ class VideomoreVideoIE(InfoExtractor):
        'params': {
            'skip_download': True,
        },
+    }, {
+        'url': 'https://videomore.ru/molodezhka/6_sezon/29_seriya?utm_so',
+        'only_matching': True,
    }]

    @classmethod
@ -196,13 +220,16 @@ class VideomoreVideoIE(InfoExtractor):
                 r'track-id=["\'](\d+)',
                 r'xcnt_product_id\s*=\s*(\d+)'), webpage, 'video id')
            video_url = 'videomore:%s' % video_id
+        else:
+            video_id = None

-        return self.url_result(video_url, VideomoreIE.ie_key())
+        return self.url_result(
+            video_url, ie=VideomoreIE.ie_key(), video_id=video_id)


 class VideomoreSeasonIE(InfoExtractor):
    IE_NAME = 'videomore:season'
-    _VALID_URL = r'https?://videomore\.ru/(?!embed)(?P<id>[^/]+/[^/?#&]+)[/?#&]*$'
+    _VALID_URL = r'https?://videomore\.ru/(?!embed)(?P<id>[^/]+/[^/?#&]+)(?:/*|[?#&].*?)$'
    _TESTS = [{
        'url': 'http://videomore.ru/molodezhka/sezon_promo',
        'info_dict': {
@ -210,8 +237,16 @@ class VideomoreSeasonIE(InfoExtractor):
            'title': 'Молодежка Промо',
        },
        'playlist_mincount': 12,
+    }, {
+        'url': 'http://videomore.ru/molodezhka/sezon_promo?utm_so',
+        'only_matching': True,
    }]

+    @classmethod
+    def suitable(cls, url):  # defer to the more specific Videomore extractors
+        claimed = VideomoreIE.suitable(url) or VideomoreVideoIE.suitable(url)
+        return False if claimed else super(VideomoreSeasonIE, cls).suitable(url)
+
    def _real_extract(self, url):
        display_id = self._match_id(url)

@ -219,9 +254,54 @@ class VideomoreSeasonIE(InfoExtractor):

        title = self._og_search_title(webpage)

-        entries = [
-            self.url_result(item) for item in re.findall(
-                r'<a[^>]+href="((?:https?:)?//videomore\.ru/%s/[^/]+)"[^>]+class="widget-item-desc"'
-                % display_id, webpage)]
+        data = self._parse_json(
+            self._html_search_regex(
+                r'\bclass=["\']seasons-tracks["\'][^>]+\bdata-custom-data=(["\'])(?P<value>{.+?})\1',
+                webpage, 'data', default='{}', group='value'),
+            display_id, fatal=False)
+
+        entries = []
+
+        if data:
+            episodes = data.get('episodes')
+            if isinstance(episodes, list):
+                for ep in episodes:
+                    if not isinstance(ep, dict):
+                        continue
+                    ep_id = int_or_none(ep.get('id'))
+                    ep_url = url_or_none(ep.get('url'))
+                    if ep_id:
+                        e = {
+                            'url': 'videomore:%s' % ep_id,
+                            'id': compat_str(ep_id),
+                        }
+                    elif ep_url:
+                        e = {'url': ep_url}
+                    else:
+                        continue
+                    e.update({
+                        '_type': 'url',
+                        'ie_key': VideomoreIE.ie_key(),
+                        'title': str_or_none(ep.get('title')),
+                        'thumbnail': url_or_none(ep.get('image')),
+                        'duration': parse_duration(ep.get('duration')),
+                        'episode_number': int_or_none(ep.get('number')),
+                        'upload_date': unified_strdate(ep.get('date')),
+                    })
+                    entries.append(e)
+
+        if not entries:
+            entries = [
+                self.url_result(
+                    'videomore:%s' % video_id, ie=VideomoreIE.ie_key(),
+                    video_id=video_id)
+                for video_id in orderedSet(re.findall(
+                    r':(?:id|key)=["\'](\d+)["\']', webpage))]
+
+        if not entries:
+            entries = [
+                self.url_result(item) for item in re.findall(
+                    r'<a[^>]+href="((?:https?:)?//videomore\.ru/%s/[^/]+)"[^>]+class="widget-item-desc"'
+                    % display_id, webpage)]

        return self.playlist_result(entries, display_id, title)
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@ -436,6 +436,8 @@ class VimeoIE(VimeoBaseInfoExtractor):
            'url': 'https://vimeo.com/160743502/abd0e13fb4',
            'only_matching': True,
        }
+        # https://gettingthingsdone.com/workflowmap/
+        # vimeo embed with check-password page protected by Referer header
    ]

    @staticmethod
@ -466,20 +468,22 @@ class VimeoIE(VimeoBaseInfoExtractor):
        urls = VimeoIE._extract_urls(url, webpage)
        return urls[0] if urls else None

-    def _verify_player_video_password(self, url, video_id):
+    def _verify_player_video_password(self, url, video_id, headers):
        password = self._downloader.params.get('videopassword')
        if password is None:
            raise ExtractorError('This video is protected by a password, use the --video-password option')
        data = urlencode_postdata({
            'password': base64.b64encode(password.encode()),
        })
-        pass_url = url + '/check-password'
-        password_request = sanitized_Request(pass_url, data)
-        password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
-        password_request.add_header('Referer', url)
-        return self._download_json(
-            password_request, video_id,
-            'Verifying the password', 'Wrong password')
+        headers = merge_dicts(headers, {
+            'Content-Type': 'application/x-www-form-urlencoded',
+        })
+        checked = self._download_json(
+            url + '/check-password', video_id,
+            'Verifying the password', data=data, headers=headers)
+        if checked is False:
+            raise ExtractorError('Wrong video password', expected=True)
+        return checked

    def _real_initialize(self):
        self._login()
@ -592,7 +596,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
                                     cause=e)
        else:
            if config.get('view') == 4:
-                config = self._verify_player_video_password(redirect_url, video_id)
+                config = self._verify_player_video_password(redirect_url, video_id, headers)

        vod = config.get('video', {}).get('vod', {})

--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@ -1868,7 +1868,7 @@ def urljoin(base, path):
        path = path.decode('utf-8')
    if not isinstance(path, compat_str) or not path:
        return None
-    if re.match(r'^(?:https?:)?//', path):
+    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode('utf-8')