From b89ac534555692c3c29a57d97ec0bda3bef3b086 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 21 May 2018 17:46:52 +0100 Subject: [PATCH 01/20] [globo] use compat_str --- youtube_dl/extractor/globo.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 8e6c38742..81d6d36d3 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -126,16 +126,16 @@ class GloboIE(InfoExtractor): continue hash_code = security_hash[:2] - received_time = int(security_hash[2:12]) + received_time = security_hash[2:12] received_random = security_hash[12:22] received_md5 = security_hash[22:] - sign_time = received_time + 86400 + sign_time = compat_str(int(received_time) + 86400) padding = '%010d' % random.randint(1, 10000000000) - md5_data = (received_md5 + str(sign_time) + padding + '0xFF01DD').encode() + md5_data = (received_md5 + sign_time + padding + '0xFF01DD').encode() signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=') - signed_hash = hash_code + compat_str(received_time) + received_random + compat_str(sign_time) + padding + signed_md5 + signed_hash = hash_code + received_time + received_random + sign_time + padding + signed_md5 signed_url = '%s?h=%s&k=%s' % (resource_url, signed_hash, 'flash') if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'): From 57d6792024f2670a21f923dfbd81614a1ee6b735 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 23 May 2018 11:27:36 +0100 Subject: [PATCH 02/20] [viewlift] fix extraction for snagfils.com(closes #15766) --- youtube_dl/extractor/viewlift.py | 164 ++++++++++++++++++++++--------- 1 file changed, 115 insertions(+), 49 deletions(-) diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py index 1f29c273f..e466156f6 100644 --- a/youtube_dl/extractor/viewlift.py +++ b/youtube_dl/extractor/viewlift.py @@ -1,24 +1,27 @@ from __future__ import unicode_literals +import base64 import re from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, clean_html, determine_ext, int_or_none, js_to_json, + parse_age_limit, parse_duration, ) class ViewLiftBaseIE(InfoExtractor): - _DOMAINS_REGEX = r'(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|monumentalsportsnetwork|vayafilm)\.com|kesari\.tv' + _DOMAINS_REGEX = r'(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm)\.com' class ViewLiftEmbedIE(ViewLiftBaseIE): - _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?:%s)/embed/player\?.*\bfilmId=(?P[\da-f-]{36})' % ViewLiftBaseIE._DOMAINS_REGEX + _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?:%s)/embed/player\?.*\bfilmId=(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' % ViewLiftBaseIE._DOMAINS_REGEX _TESTS = [{ 'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500', 'md5': '2924e9215c6eff7a55ed35b72276bd93', @@ -60,8 +63,10 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): formats = [] has_bitrate = False - for source in self._parse_json(js_to_json(self._search_regex( - r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')), video_id): + sources = self._parse_json(self._search_regex( + r'(?s)sources:\s*(\[.+?\]),', webpage, + 'sources', default='[]'), video_id, js_to_json) + for source in sources: file_ = source.get('file') if not file_: continue @@ -70,7 +75,8 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): format_id = source.get('label') or ext if all(v in ('m3u8', 'hls') for v in (type_, ext)): formats.extend(self._extract_m3u8_formats( - file_, video_id, 'mp4', m3u8_id='hls')) + file_, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) else: bitrate = int_or_none(self._search_regex( [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext], @@ -85,6 +91,13 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): 'tbr': bitrate, 'height': height, }) + if not formats: + hls_url = self._parse_json(self._search_regex( + r'filmInfo\.src\s*=\s*({.+?});', + webpage, 'src'), video_id, js_to_json)['src'] + formats = self._extract_m3u8_formats( + hls_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) field_preference = None if has_bitrate else ('height', 'tbr', 'format_id') self._sort_formats(formats, field_preference) @@ -109,10 +122,13 @@ class ViewLiftIE(ViewLiftBaseIE): 'display_id': 'lost_for_life', 'ext': 'mp4', 'title': 'Lost for Life', - 'description': 'md5:fbdacc8bb6b455e464aaf98bc02e1c82', + 'description': 'md5:ea10b5a50405ae1f7b5269a6ec594102', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 4489, - 'categories': ['Documentary', 'Crime', 'Award Winning', 'Festivals'] + 'categories': 'mincount:3', + 'age_limit': 14, + 'upload_date': '20150421', + 'timestamp': 1429656819, } }, { 'url': 'http://www.snagfilms.com/show/the_world_cut_project/india', @@ -125,7 +141,9 @@ class ViewLiftIE(ViewLiftBaseIE): 'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 979, - 'categories': ['Documentary', 'Sports', 'Politics'] + 'categories': 'mincount:2', + 'timestamp': 1399478279, + 'upload_date': '20140507', } }, { # Film is not playable in your area. @@ -138,9 +156,6 @@ class ViewLiftIE(ViewLiftBaseIE): }, { 'url': 'http://www.winnersview.com/videos/the-good-son', 'only_matching': True, - }, { - 'url': 'http://www.kesari.tv/news/video/1461919076414', - 'only_matching': True, }, { # Was once Kaltura embed 'url': 'https://www.monumentalsportsnetwork.com/videos/john-carlson-postgame-2-25-15', @@ -156,45 +171,96 @@ class ViewLiftIE(ViewLiftBaseIE): raise ExtractorError( 'Film %s is not available.' % display_id, expected=True) - film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id') + initial_store_state = self._search_regex( + r"window\.initialStoreState\s*=.*?JSON\.parse\(unescape\(atob\('([^']+)'\)\)\)", + webpage, 'Initial Store State', default=None) + if initial_store_state: + modules = self._parse_json(compat_urllib_parse_unquote(base64.b64decode( + initial_store_state).decode()), display_id)['page']['data']['modules'] + content_data = next(m['contentData'][0] for m in modules if m.get('moduleType') == 'VideoDetailModule') + gist = content_data['gist'] + film_id = gist['id'] + title = gist['title'] + video_assets = content_data['streamingInfo']['videoAssets'] - snag = self._parse_json( - self._search_regex( - r'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag'), - display_id) + formats = [] + mpeg_video_assets = video_assets.get('mpeg') or [] + for video_asset in mpeg_video_assets: + video_asset_url = video_asset.get('url') + if not video_asset: + continue + bitrate = int_or_none(video_asset.get('bitrate')) + height = int_or_none(self._search_regex( + r'^_?(\d+)[pP]$', video_asset.get('renditionValue'), + 'height', default=None)) + formats.append({ + 'url': video_asset_url, + 'format_id': 'http%s' % ('-%d' % bitrate if bitrate else ''), + 'tbr': bitrate, + 'height': height, + 'vcodec': video_asset.get('codec'), + }) - for item in snag: - if item.get('data', {}).get('film', {}).get('id') == film_id: - data = item['data']['film'] - title = data['title'] - description = clean_html(data.get('synopsis')) - thumbnail = data.get('image') - duration = int_or_none(data.get('duration') or data.get('runtime')) - categories = [ - category['title'] for category in data.get('categories', []) - if category.get('title')] - break + hls_url = video_assets.get('hls') + if hls_url: + formats.extend(self._extract_m3u8_formats( + hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats, ('height', 'tbr', 'format_id')) + + info = { + 'id': film_id, + 'display_id': display_id, + 'title': title, + 'description': gist.get('description'), + 'thumbnail': gist.get('videoImageUrl'), + 'duration': int_or_none(gist.get('runtime')), + 'age_limit': parse_age_limit(content_data.get('parentalRating', '').replace('_', '-')), + 'timestamp': int_or_none(gist.get('publishDate'), 1000), + 'formats': formats, + } + for k in ('categories', 'tags'): + info[k] = [v['title'] for v in content_data.get(k, []) if v.get('title')] + return info else: - title = self._search_regex( - r'itemprop="title">([^<]+)<', webpage, 'title') - description = self._html_search_regex( - r'(?s)
(.+?)
', - webpage, 'description', default=None) or self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - duration = parse_duration(self._search_regex( - r'([^<]+)<', - webpage, 'duration', fatal=False)) - categories = re.findall(r'([^<]+)', webpage) + film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id') - return { - '_type': 'url_transparent', - 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), - 'id': film_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'categories': categories, - 'ie_key': 'ViewLiftEmbed', - } + snag = self._parse_json( + self._search_regex( + r'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag', default='[]'), + display_id) + + for item in snag: + if item.get('data', {}).get('film', {}).get('id') == film_id: + data = item['data']['film'] + title = data['title'] + description = clean_html(data.get('synopsis')) + thumbnail = data.get('image') + duration = int_or_none(data.get('duration') or data.get('runtime')) + categories = [ + category['title'] for category in data.get('categories', []) + if category.get('title')] + break + else: + title = self._search_regex( + r'itemprop="title">([^<]+)<', webpage, 'title') + description = self._html_search_regex( + r'(?s)
(.+?)
', + webpage, 'description', default=None) or self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + duration = parse_duration(self._search_regex( + r'([^<]+)<', + webpage, 'duration', fatal=False)) + categories = re.findall(r'([^<]+)', webpage) + + return { + '_type': 'url_transparent', + 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), + 'id': film_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'categories': categories, + 'ie_key': 'ViewLiftEmbed', + } From b836118724122a639a1cb78d55d91724bf1e7251 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 23 May 2018 12:12:20 +0100 Subject: [PATCH 03/20] [utils] Relax TV Parental Guidelines matching --- youtube_dl/utils.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index f9ca63c58..d61af8837 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2253,12 +2253,12 @@ US_RATINGS = { TV_PARENTAL_GUIDELINES = { - 'TV-Y': 0, - 'TV-Y7': 7, - 'TV-G': 0, - 'TV-PG': 0, - 'TV-14': 14, - 'TV-MA': 17, + 'Y': 0, + 'Y7': 7, + 'G': 0, + 'PG': 0, + '14': 14, + 'MA': 17, } @@ -2272,7 +2272,10 @@ def parse_age_limit(s): return int(m.group('age')) if s in US_RATINGS: return US_RATINGS[s] - return TV_PARENTAL_GUIDELINES.get(s) + m = re.match(r'^TV[_-]?(%s)$' % '|'.join(TV_PARENTAL_GUIDELINES.keys()), s) + if m: + return TV_PARENTAL_GUIDELINES[m.group(1)] + return None def strip_jsonp(code): From 670dcba8c73ee69545513522676b2c480bc48662 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 23 May 2018 12:13:44 +0100 Subject: [PATCH 04/20] [viewlift] Remove rating format transformation --- youtube_dl/extractor/viewlift.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py index e466156f6..51a002b11 100644 --- a/youtube_dl/extractor/viewlift.py +++ b/youtube_dl/extractor/viewlift.py @@ -214,7 +214,7 @@ class ViewLiftIE(ViewLiftBaseIE): 'description': gist.get('description'), 'thumbnail': gist.get('videoImageUrl'), 'duration': int_or_none(gist.get('runtime')), - 'age_limit': parse_age_limit(content_data.get('parentalRating', '').replace('_', '-')), + 'age_limit': parse_age_limit(content_data.get('parentalRating')), 'timestamp': int_or_none(gist.get('publishDate'), 1000), 'formats': formats, } From 268e132dec96ea9e8a9a3cafb788baf39a498c7d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 23 May 2018 12:15:21 +0100 Subject: [PATCH 05/20] [go90] extract age limit and detect drm protection(#10127) --- youtube_dl/extractor/go90.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py index 9b2e1c164..35dde42d0 100644 --- a/youtube_dl/extractor/go90.py +++ b/youtube_dl/extractor/go90.py @@ -6,7 +6,9 @@ import re from .common import InfoExtractor from ..utils import ( determine_ext, + ExtractorError, int_or_none, + parse_age_limit, parse_iso8601, ) @@ -23,6 +25,7 @@ class Go90IE(InfoExtractor): 'description': 'VICE\'s Karley Sciortino meets with activists who discuss the state\'s strong anti-porn stance. Then, VICE Sports explains NFL contracts.', 'timestamp': 1491868800, 'upload_date': '20170411', + 'age_limit': 14, } } @@ -33,6 +36,8 @@ class Go90IE(InfoExtractor): video_id, headers={ 'Content-Type': 'application/json; charset=utf-8', }, data=b'{"client":"web","device_type":"pc"}') + if video_data.get('requires_drm'): + raise ExtractorError('This video is DRM protected.', expected=True) main_video_asset = video_data['main_video_asset'] episode_number = int_or_none(video_data.get('episode_number')) @@ -123,4 +128,5 @@ class Go90IE(InfoExtractor): 'season_number': season_number, 'episode_number': episode_number, 'subtitles': subtitles, + 'age_limit': parse_age_limit(video_data.get('rating')), } From 3bb3ff38a15ccf00686c75af8d6635903632ee87 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 23 May 2018 12:20:05 +0100 Subject: [PATCH 06/20] [test_utils] add tests for b836118724122a639a1cb78d55d91724bf1e7251 --- test/test_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 14503ab53..f2b51131c 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -519,6 +519,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_age_limit('PG-13'), 13) self.assertEqual(parse_age_limit('TV-14'), 14) self.assertEqual(parse_age_limit('TV-MA'), 17) + self.assertEqual(parse_age_limit('TV14'), 14) + self.assertEqual(parse_age_limit('TV_G'), 0) def test_parse_duration(self): self.assertEqual(parse_duration(None), None) From ca0aef42d4fa77123c56c19ef3fe2673645391a2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 23 May 2018 23:04:12 +0100 Subject: [PATCH 07/20] [viewlift] add support for hoichoi.tv(closes #16536) --- youtube_dl/extractor/viewlift.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py index 51a002b11..c43d1a1e8 100644 --- a/youtube_dl/extractor/viewlift.py +++ b/youtube_dl/extractor/viewlift.py @@ -17,7 +17,7 @@ from ..utils import ( class ViewLiftBaseIE(InfoExtractor): - _DOMAINS_REGEX = r'(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm)\.com' + _DOMAINS_REGEX = r'(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm)\.com|hoichoi\.tv' class ViewLiftEmbedIE(ViewLiftBaseIE): From 1139935db78b610d15ade2b667e2a07b4df0ecf0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 24 May 2018 02:51:47 +0100 Subject: [PATCH 08/20] [nbc] add support for stream.nbcsports.com(closes #13911) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/nbc.py | 62 +++++++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 7d5927131..52e330955 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -666,6 +666,7 @@ from .nbc import ( NBCOlympicsIE, NBCOlympicsStreamIE, NBCSportsIE, + NBCSportsStreamIE, NBCSportsVPlayerIE, ) from .ndr import ( diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 1b1722cfa..c843f8649 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -1,7 +1,8 @@ from __future__ import unicode_literals -import re import base64 +import json +import re from .common import InfoExtractor from .theplatform import ThePlatformIE @@ -175,6 +176,65 @@ class NBCSportsIE(InfoExtractor): NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer') +class NBCSportsStreamIE(AdobePassIE): + _VALID_URL = r'https?://stream\.nbcsports\.com/.+?\bpid=(?P\d+)' + _TEST = { + 'url': 'http://stream.nbcsports.com/nbcsn/generic?pid=206559', + 'info_dict': { + 'id': '206559', + 'ext': 'mp4', + 'title': 'Amgen Tour of California Women\'s Recap', + 'description': 'md5:66520066b3b5281ada7698d0ea2aa894', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'Requires Adobe Pass Authentication', + } + + def _real_extract(self, url): + video_id = self._match_id(url) + live_source = self._download_json( + 'http://stream.nbcsports.com/data/live_sources_%s.json' % video_id, + video_id) + video_source = live_source['videoSources'][0] + title = video_source['title'] + source_url = None + for k in ('source', 'msl4source', 'iossource', 'hlsv4'): + sk = k + 'Url' + source_url = video_source.get(sk) or video_source.get(sk + 'Alt') + if source_url: + break + else: + source_url = video_source['ottStreamUrl'] + is_live = video_source.get('type') == 'live' or video_source.get('status') == 'Live' + resource = self._get_mvpd_resource('nbcsports', title, video_id, '') + token = self._extract_mvpd_auth(url, video_id, 'nbcsports', resource) + tokenized_url = self._download_json( + 'https://token.playmakerservices.com/cdn', + video_id, data=json.dumps({ + 'requestorId': 'nbcsports', + 'pid': video_id, + 'application': 'NBCSports', + 'version': 'v1', + 'platform': 'desktop', + 'cdn': 'akamai', + 'url': video_source['sourceUrl'], + 'token': base64.b64encode(token.encode()).decode(), + 'resourceId': base64.b64encode(resource.encode()).decode(), + }).encode())['tokenizedUrl'] + formats = self._extract_m3u8_formats(tokenized_url, video_id, 'mp4') + self._sort_formats(formats) + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'description': live_source.get('description'), + 'formats': formats, + 'is_live': is_live, + } + + class CSNNEIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?csnne\.com/video/(?P[0-9a-z-]+)' From e8e58c22786918f93e6928d86b878fdc56461c4d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 24 May 2018 11:53:42 +0100 Subject: [PATCH 09/20] [hidive] add support for authentication(closes #16534) --- youtube_dl/extractor/hidive.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/youtube_dl/extractor/hidive.py b/youtube_dl/extractor/hidive.py index eee517071..d8f2e682f 100644 --- a/youtube_dl/extractor/hidive.py +++ b/youtube_dl/extractor/hidive.py @@ -17,6 +17,9 @@ class HiDiveIE(InfoExtractor): # Using X-Forwarded-For results in 403 HTTP error for HLS fragments, # so disabling geo bypass completely _GEO_BYPASS = False + _NETRC_MACHINE = 'hidive' + _LOGGED_IN = False + _LOGIN_URL = 'https://www.hidive.com/account/login' _TESTS = [{ 'url': 'https://www.hidive.com/stream/the-comic-artist-and-his-assistants/s01e001', @@ -31,8 +34,30 @@ class HiDiveIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Requires Authentication', }] + def _real_initialize(self): + if self._LOGGED_IN: + return + + (email, password) = self._get_login_info() + if email is None: + return + + webpage = self._download_webpage(self._LOGIN_URL, None) + form = self._search_regex( + r'(?s)]+action="/account/login"[^>]*>(.+?)', + webpage, 'login form') + data = self._hidden_inputs(form) + data.update({ + 'Email': email, + 'Password': password, + }) + self._download_webpage( + self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(data)) + self._LOGGED_IN = True + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) title, key = mobj.group('title', 'key') @@ -43,6 +68,7 @@ class HiDiveIE(InfoExtractor): data=urlencode_postdata({ 'Title': title, 'Key': key, + 'PlayerId': 'f4f895ce1ca713ba263b91caeb1daa2d08904783', })) restriction = settings.get('restrictionReason') @@ -79,6 +105,7 @@ class HiDiveIE(InfoExtractor): subtitles.setdefault(cc_lang, []).append({ 'url': cc_url, }) + self._sort_formats(formats) season_number = int_or_none(self._search_regex( r's(\d+)', key, 'season number', default=None)) From 3d2a643fdcba126b209b758f2e403742ee631cf3 Mon Sep 17 00:00:00 2001 From: Jakub Wilk Date: Thu, 24 May 2018 11:15:03 +0200 Subject: [PATCH 10/20] [imgur] Fix extraction --- youtube_dl/extractor/imgur.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index 67c24a51c..2901960a5 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse from ..utils import ( int_or_none, js_to_json, @@ -21,7 +20,7 @@ class ImgurIE(InfoExtractor): 'id': 'A61SaA1', 'ext': 'mp4', 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 'Imgur: The most awesome images on the Internet.', + 'description': 'Imgur: The magic of the Internet', }, }, { 'url': 'https://imgur.com/A61SaA1', @@ -29,7 +28,7 @@ class ImgurIE(InfoExtractor): 'id': 'A61SaA1', 'ext': 'mp4', 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 'Imgur: The most awesome images on the Internet.', + 'description': 'Imgur: The magic of the Internet', }, }, { 'url': 'https://imgur.com/gallery/YcAQlkx', @@ -37,8 +36,6 @@ class ImgurIE(InfoExtractor): 'id': 'YcAQlkx', 'ext': 'mp4', 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', - 'description': 'Imgur: The most awesome images on the Internet.' - } }, { 'url': 'http://imgur.com/topic/Funny/N8rOudd', @@ -50,8 +47,8 @@ class ImgurIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - compat_urlparse.urljoin(url, video_id), video_id) + gifv_url = 'https://i.imgur.com/{id}.gifv'.format(id=video_id) + webpage = self._download_webpage(gifv_url, video_id) width = int_or_none(self._og_search_property( 'video:width', webpage, default=None)) @@ -107,7 +104,7 @@ class ImgurIE(InfoExtractor): return { 'id': video_id, 'formats': formats, - 'description': self._og_search_description(webpage), + 'description': self._og_search_description(webpage, default=None), 'title': self._og_search_title(webpage), } From c561b75c82247188e010b6b53c118bb26b4daaf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 00:09:15 +0700 Subject: [PATCH 11/20] [peertube] Add extractor (closes #16301, closes #16329) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/peertube.py | 210 +++++++++++++++++++++++++++++ 2 files changed, 211 insertions(+) create mode 100644 youtube_dl/extractor/peertube.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 52e330955..374aa185c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -811,6 +811,7 @@ from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE from .pearvideo import PearVideoIE +from .peertube import PeerTubeIE from .people import PeopleIE from .performgroup import PerformGroupIE from .periscope import ( diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py new file mode 100644 index 000000000..b086f6f5a --- /dev/null +++ b/youtube_dl/extractor/peertube.py @@ -0,0 +1,210 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + parse_resolution, + try_get, + unified_timestamp, + urljoin, +) + + +class PeerTubeIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + # Taken from https://instances.joinpeertube.org/instances + tube\.openalgeria\.org| + peertube\.pointsecu\.fr| + peertube\.nogafa\.org| + peertube\.pl| + megatube\.lilomoino\.fr| + peertube\.tamanoir\.foucry\.net| + peertube\.inapurna\.org| + peertube\.netzspielplatz\.de| + video\.deadsuperhero\.com| + peertube\.devosi\.org| + peertube\.1312\.media| + tube\.worldofhauru\.xyz| + tube\.bootlicker\.party| + skeptikon\.fr| + peertube\.geekshell\.fr| + tube\.opportunis\.me| + peertube\.peshane\.net| + video\.blueline\.mg| + tube\.homecomputing\.fr| + videos\.cloudfrancois\.fr| + peertube\.viviers-fibre\.net| + tube\.ouahpiti\.info| + video\.tedomum\.net| + video\.g3l\.org| + fontube\.fr| + peertube\.gaialabs\.ch| + peertube\.extremely\.online| + peertube\.public-infrastructure\.eu| + tube\.kher\.nl| + peertube\.qtg\.fr| + tube\.22decembre\.eu| + facegirl\.me| + video\.migennes\.net| + janny\.moe| + tube\.p2p\.legal| + video\.atlanti\.se| + troll\.tv| + peertube\.geekael\.fr| + vid\.leotindall\.com| + video\.anormallostpod\.ovh| + p-tube\.h3z\.jp| + tube\.darfweb\.eu| + videos\.iut-orsay\.fr| + peertube\.solidev\.net| + videos\.symphonie-of-code\.fr| + testtube\.ortg\.de| + videos\.cemea\.org| + peertube\.gwendalavir\.eu| + video\.passageenseine\.fr| + videos\.festivalparminous\.org| + peertube\.touhoppai\.moe| + peertube\.duckdns\.org| + sikke\.fi| + peertube\.mastodon\.host| + firedragonvideos\.com| + vidz\.dou\.bet| + peertube\.koehn\.com| + peer\.hostux\.social| + share\.tube| + peertube\.walkingmountains\.fr| + medias\.libox\.fr| + peertube\.moe| + peertube\.xyz| + jp\.peertube\.network| + videos\.benpro\.fr| + tube\.otter\.sh| + peertube\.angristan\.xyz| + peertube\.parleur\.net| + peer\.ecutsa\.fr| + peertube\.heraut\.eu| + peertube\.tifox\.fr| + peertube\.maly\.io| + vod\.mochi\.academy| + exode\.me| + coste\.video| + tube\.aquilenet\.fr| + peertube\.gegeweb\.eu| + framatube\.org| + thinkerview\.video| + tube\.conferences-gesticulees\.net| + peertube\.datagueule\.tv| + video\.lqdn\.fr| + meilleurtube\.delire\.party| + tube\.mochi\.academy| + peertube\.dav\.li| + media\.zat\.im| + pytu\.be| + peertube\.valvin\.fr| + peertube\.nsa\.ovh| + video\.colibris-outilslibres\.org| + video\.hispagatos\.org| + tube\.svnet\.fr| + peertube\.video| + videos\.lecygnenoir\.info| + peertube3\.cpy\.re| + peertube2\.cpy\.re| + videos\.tcit\.fr| + peertube\.cpy\.re + ) + /videos/watch/(?P[^/?#&]+) + ''' + _TESTS = [{ + 'url': 'https://peertube.moe/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c', + 'md5': '80f24ff364cc9d333529506a263e7feb', + 'info_dict': { + 'id': '2790feb0-8120-4e63-9af3-c943c69f5e6c', + 'ext': 'mp4', + 'title': 'wow', + 'description': 'wow such video, so gif', + 'thumbnail': r're:https?://.*\.(?:jpg|png)', + 'timestamp': 1519297480, + 'upload_date': '20180222', + 'uploader': 'Luclu7', + 'uploader_id': '7fc42640-efdb-4505-a45d-a15b1a5496f1', + 'uploder_url': 'https://peertube.nsa.ovh/accounts/luclu7', + 'license': 'Unknown', + 'duration': 3, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'tags': list, + 'categories': list, + } + }, { + 'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44', + 'only_matching': True, + }, { + # nsfw + 'url': 'https://tube.22decembre.eu/videos/watch/9bb88cd3-9959-46d9-9ab9-33d2bb704c39', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + urljoin(url, '/api/v1/videos/%s' % video_id), video_id) + + title = video['name'] + + formats = [] + for file_ in video['files']: + if not isinstance(file_, dict): + continue + file_url = file_.get('fileUrl') + if not file_url or not isinstance(file_url, compat_str): + continue + file_size = int_or_none(file_.get('size')) + format_id = try_get( + file_, lambda x: x['resolution']['label'], compat_str) + f = parse_resolution(format_id) + f.update({ + 'url': file_url, + 'format_id': format_id, + 'filesize': file_size, + }) + formats.append(f) + self._sort_formats(formats) + + def account_data(field): + return try_get(video, lambda x: x['account'][field], compat_str) + + category = try_get(video, lambda x: x['category']['label'], compat_str) + categories = [category] if category else None + + nsfw = video.get('nsfw') + if nsfw is bool: + age_limit = 18 if nsfw else 0 + else: + age_limit = None + + return { + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': urljoin(url, video.get('thumbnailPath')), + 'timestamp': unified_timestamp(video.get('publishedAt')), + 'uploader': account_data('displayName'), + 'uploader_id': account_data('uuid'), + 'uploder_url': account_data('url'), + 'license': try_get( + video, lambda x: x['licence']['label'], compat_str), + 'duration': int_or_none(video.get('duration')), + 'view_count': int_or_none(video.get('views')), + 'like_count': int_or_none(video.get('likes')), + 'dislike_count': int_or_none(video.get('dislikes')), + 'age_limit': age_limit, + 'tags': try_get(video, lambda x: x['tags'], list), + 'categories': categories, + 'formats': formats, + } From f2fc63a5a873391b9ac15642507a2eae71e42906 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 00:15:38 +0700 Subject: [PATCH 12/20] [peertube] Add support for embed and API URLs --- youtube_dl/extractor/peertube.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index b086f6f5a..61c41add0 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -116,7 +116,8 @@ class PeerTubeIE(InfoExtractor): videos\.tcit\.fr| peertube\.cpy\.re ) - /videos/watch/(?P[^/?#&]+) + /(?:videos/(?:watch|embed)|api/v\d/videos)/ + (?P[^/?#&]+) ''' _TESTS = [{ 'url': 'https://peertube.moe/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c', @@ -147,6 +148,12 @@ class PeerTubeIE(InfoExtractor): # nsfw 'url': 'https://tube.22decembre.eu/videos/watch/9bb88cd3-9959-46d9-9ab9-33d2bb704c39', 'only_matching': True, + }, { + 'url': 'https://tube.22decembre.eu/videos/embed/fed67262-6edb-4d1c-833b-daa9085c71d7', + 'only_matching': True, + }, { + 'url': 'https://tube.openalgeria.org/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8', + 'only_matching': True, }] def _real_extract(self, url): From 6bd499e8ca769cf69c4b24fa2d7a751d7869b679 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 00:28:30 +0700 Subject: [PATCH 13/20] [peertube] Add support for generic embeds --- youtube_dl/extractor/generic.py | 15 +++++++++++++++ youtube_dl/extractor/peertube.py | 23 +++++++++++++++++------ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 76852f9dc..47ac139c9 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -108,6 +108,7 @@ from .yapfiles import YapFilesIE from .vice import ViceIE from .xfileshare import XFileShareIE from .cloudflarestream import CloudflareStreamIE +from .peertube import PeerTubeIE class GenericIE(InfoExtractor): @@ -2012,6 +2013,15 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + { + # PeerTube embed + 'url': 'https://joinpeertube.org/fr/home/', + 'info_dict': { + 'id': 'home', + 'title': 'Reprenez le contrôle de vos vidéos ! #JoinPeertube', + }, + 'playlist_count': 2, + }, { 'url': 'http://share-videos.se/auto/video/83645793?uid=13', 'md5': 'b68d276de422ab07ee1d49388103f457', @@ -3029,6 +3039,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key()) + peertube_urls = PeerTubeIE._extract_urls(webpage) + if peertube_urls: + return self.playlist_from_matches( + peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) + sharevideos_urls = [mobj.group('url') for mobj in re.finditer( r']+?\bsrc\s*=\s*(["\'])(?P(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', webpage)] diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index 61c41add0..a481b3151 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -13,9 +15,7 @@ from ..utils import ( class PeerTubeIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?: + _INSTANCES_RE = r'''(?: # Taken from https://instances.joinpeertube.org/instances tube\.openalgeria\.org| peertube\.pointsecu\.fr| @@ -115,10 +115,13 @@ class PeerTubeIE(InfoExtractor): peertube2\.cpy\.re| videos\.tcit\.fr| peertube\.cpy\.re - ) + )''' + _VALID_URL = r'''(?x) + https?:// + %s /(?:videos/(?:watch|embed)|api/v\d/videos)/ - (?P[^/?#&]+) - ''' + (?P[^/?\#&]+) + ''' % _INSTANCES_RE _TESTS = [{ 'url': 'https://peertube.moe/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c', 'md5': '80f24ff364cc9d333529506a263e7feb', @@ -156,6 +159,14 @@ class PeerTubeIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'''(?x)]+\bsrc=(["\'])(?P(?:https?:)?//%s/videos/embed/[^/?\#&]+)\1''' + % PeerTubeIE._INSTANCES_RE, webpage)] + def _real_extract(self, url): video_id = self._match_id(url) From b39f42ee92a3cd669da24db9798e1dc9b574720f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A1s=20Veres-Szentkir=C3=A1lyi?= Date: Fri, 25 May 2018 19:46:05 +0200 Subject: [PATCH 14/20] [indavideo] Sign download URLs --- youtube_dl/extractor/indavideo.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py index 11cf3c609..15b766fb2 100644 --- a/youtube_dl/extractor/indavideo.py +++ b/youtube_dl/extractor/indavideo.py @@ -2,10 +2,12 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( int_or_none, parse_age_limit, parse_iso8601, + update_url_query, ) @@ -58,11 +60,10 @@ class IndavideoEmbedIE(InfoExtractor): if flv_url not in video_urls: video_urls.append(flv_url) - formats = [{ - 'url': video_url, - 'height': int_or_none(self._search_regex( - r'\.(\d{3,4})\.mp4(?:\?|$)', video_url, 'height', default=None)), - } for video_url in video_urls] + filesh = video.get('filesh') + formats = [ + self.video_url_to_format(video_url, filesh) + for video_url in video_urls] self._sort_formats(formats) timestamp = video.get('date') @@ -90,6 +91,18 @@ class IndavideoEmbedIE(InfoExtractor): 'formats': formats, } + def video_url_to_format(self, video_url, filesh): + height = int_or_none(self._search_regex( + r'\.(\d{3,4})\.mp4(?:\?|$)', video_url, 'height', default=None)) + if height and filesh: + token = filesh.get(compat_str(height)) + if token is not None: + video_url = update_url_query(video_url, {'token': token}) + return { + 'url': video_url, + 'height': height, + } + class IndavideoIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?indavideo\.hu/video/(?P[^/#?]+)' From 2a7c6befc16f72df5368cb4adccd1cd84fd432d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 01:09:44 +0700 Subject: [PATCH 15/20] [indavideo] Fix extraction (closes #11221) --- youtube_dl/extractor/indavideo.py | 48 +++++++++++++++++++------------ 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py index 15b766fb2..2946c7b84 100644 --- a/youtube_dl/extractor/indavideo.py +++ b/youtube_dl/extractor/indavideo.py @@ -15,7 +15,7 @@ class IndavideoEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:embed\.)?indavideo\.hu/player/video/|assets\.indavideo\.hu/swf/player\.swf\?.*\b(?:v(?:ID|id))=)(?P[\da-f]+)' _TESTS = [{ 'url': 'http://indavideo.hu/player/video/1bdc3c6d80/', - 'md5': 'f79b009c66194acacd40712a6778acfa', + 'md5': 'c8a507a1c7410685f83a06eaeeaafeab', 'info_dict': { 'id': '1837039', 'ext': 'mp4', @@ -47,7 +47,14 @@ class IndavideoEmbedIE(InfoExtractor): title = video['title'] - video_urls = video.get('video_files', []) + video_urls = [] + + video_files = video.get('video_files') + if isinstance(video_files, list): + video_urls.extend(video_files) + elif isinstance(video_files, dict): + video_urls.extend(video_files.values()) + video_file = video.get('video_file') if video: video_urls.append(video_file) @@ -61,9 +68,22 @@ class IndavideoEmbedIE(InfoExtractor): video_urls.append(flv_url) filesh = video.get('filesh') - formats = [ - self.video_url_to_format(video_url, filesh) - for video_url in video_urls] + + formats = [] + for video_url in video_urls: + height = int_or_none(self._search_regex( + r'\.(\d{3,4})\.mp4(?:\?|$)', video_url, 'height', default=None)) + if filesh: + if not height: + continue + token = filesh.get(compat_str(height)) + if token is None: + continue + video_url = update_url_query(video_url, {'token': token}) + formats.append({ + 'url': video_url, + 'height': height, + }) self._sort_formats(formats) timestamp = video.get('date') @@ -91,18 +111,6 @@ class IndavideoEmbedIE(InfoExtractor): 'formats': formats, } - def video_url_to_format(self, video_url, filesh): - height = int_or_none(self._search_regex( - r'\.(\d{3,4})\.mp4(?:\?|$)', video_url, 'height', default=None)) - if height and filesh: - token = filesh.get(compat_str(height)) - if token is not None: - video_url = update_url_query(video_url, {'token': token}) - return { - 'url': video_url, - 'height': height, - } - class IndavideoIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?indavideo\.hu/video/(?P[^/#?]+)' @@ -122,7 +130,7 @@ class IndavideoIE(InfoExtractor): 'upload_date': '20140127', 'duration': 7, 'age_limit': 0, - 'tags': ['vicces', 'macska', 'cica', 'ügyes', 'nevetés', 'játszik', 'Cukiság', 'Jet_Pack'], + 'tags': list, }, }, { 'url': 'http://index.indavideo.hu/video/2015_0728_beregszasz', @@ -146,7 +154,9 @@ class IndavideoIE(InfoExtractor): webpage = self._download_webpage(url, display_id) embed_url = self._search_regex( - r']+rel="video_src"[^>]+href="(.+?)"', webpage, 'embed url') + (r']+\bsrc=(["\'])(?P(?:https?:)?//embed\.indavideo\.hu/player/video/.+?)\1', + r']+rel="video_src"[^>]+href="(?P.+?)"'), + webpage, 'embed url', group='url') return { '_type': 'url_transparent', From aee36ca832ec3a5696c40707098d97be0353e997 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 01:25:40 +0700 Subject: [PATCH 16/20] [indavideo] Add support for generic embeds (closes #11989) --- youtube_dl/extractor/extractors.py | 5 +-- youtube_dl/extractor/generic.py | 24 ++++++++++ youtube_dl/extractor/indavideo.py | 70 +++++++----------------------- 3 files changed, 41 insertions(+), 58 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 374aa185c..c9b49a0cd 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -469,10 +469,7 @@ from .imgur import ( ) from .ina import InaIE from .inc import IncIE -from .indavideo import ( - IndavideoIE, - IndavideoEmbedIE, -) +from .indavideo import IndavideoEmbedIE from .infoq import InfoQIE from .instagram import InstagramIE, InstagramUserIE from .internazionale import InternazionaleIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 47ac139c9..0292e0458 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -109,6 +109,7 @@ from .vice import ViceIE from .xfileshare import XFileShareIE from .cloudflarestream import CloudflareStreamIE from .peertube import PeerTubeIE +from .indavideo import IndavideoEmbedIE class GenericIE(InfoExtractor): @@ -2022,6 +2023,24 @@ class GenericIE(InfoExtractor): }, 'playlist_count': 2, }, + { + # Indavideo embed + 'url': 'https://streetkitchen.hu/receptek/igy_kell_otthon_hamburgert_sutni/', + 'info_dict': { + 'id': '1693903', + 'ext': 'mp4', + 'title': 'Így kell otthon hamburgert sütni', + 'description': 'md5:f5a730ecf900a5c852e1e00540bbb0f7', + 'timestamp': 1426330212, + 'upload_date': '20150314', + 'uploader': 'StreetKitchen', + 'uploader_id': '546363', + }, + 'add_ie': [IndavideoEmbedIE.ie_key()], + 'params': { + 'skip_download': True, + }, + }, { 'url': 'http://share-videos.se/auto/video/83645793?uid=13', 'md5': 'b68d276de422ab07ee1d49388103f457', @@ -3044,6 +3063,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) + indavideo_urls = IndavideoEmbedIE._extract_urls(webpage) + if indavideo_urls: + return self.playlist_from_matches( + indavideo_urls, video_id, video_title, ie=IndavideoEmbedIE.ie_key()) + sharevideos_urls = [mobj.group('url') for mobj in re.finditer( r']+?\bsrc\s*=\s*(["\'])(?P(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', webpage)] diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py index 2946c7b84..2b5b2b5b0 100644 --- a/youtube_dl/extractor/indavideo.py +++ b/youtube_dl/extractor/indavideo.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -38,6 +40,20 @@ class IndavideoEmbedIE(InfoExtractor): 'only_matching': True, }] + # Some example URLs covered by generic extractor: + # http://indavideo.hu/video/Vicces_cica_1 + # http://index.indavideo.hu/video/2015_0728_beregszasz + # http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko + # http://erotika.indavideo.hu/video/Amator_tini_punci + # http://film.indavideo.hu/video/f_hrom_nagymamm_volt + # http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+\bsrc=["\'](?P(?:https?:)?//embed\.indavideo\.hu/player/video/[\da-f]+)', + webpage) + def _real_extract(self, url): video_id = self._match_id(url) @@ -110,57 +126,3 @@ class IndavideoEmbedIE(InfoExtractor): 'tags': tags, 'formats': formats, } - - -class IndavideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:.+?\.)?indavideo\.hu/video/(?P[^/#?]+)' - _TESTS = [{ - 'url': 'http://indavideo.hu/video/Vicces_cica_1', - 'md5': '8c82244ba85d2a2310275b318eb51eac', - 'info_dict': { - 'id': '1335611', - 'display_id': 'Vicces_cica_1', - 'ext': 'mp4', - 'title': 'Vicces cica', - 'description': 'Játszik a tablettel. :D', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Jet_Pack', - 'uploader_id': '491217', - 'timestamp': 1390821212, - 'upload_date': '20140127', - 'duration': 7, - 'age_limit': 0, - 'tags': list, - }, - }, { - 'url': 'http://index.indavideo.hu/video/2015_0728_beregszasz', - 'only_matching': True, - }, { - 'url': 'http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko', - 'only_matching': True, - }, { - 'url': 'http://erotika.indavideo.hu/video/Amator_tini_punci', - 'only_matching': True, - }, { - 'url': 'http://film.indavideo.hu/video/f_hrom_nagymamm_volt', - 'only_matching': True, - }, { - 'url': 'http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - embed_url = self._search_regex( - (r']+\bsrc=(["\'])(?P(?:https?:)?//embed\.indavideo\.hu/player/video/.+?)\1', - r']+rel="video_src"[^>]+href="(?P.+?)"'), - webpage, 'embed url', group='url') - - return { - '_type': 'url_transparent', - 'ie_key': 'IndavideoEmbed', - 'url': embed_url, - 'display_id': display_id, - } From f4d261b765a17ef2beccec78680ec693c7df014c Mon Sep 17 00:00:00 2001 From: Enes Date: Tue, 24 Apr 2018 22:48:40 +0300 Subject: [PATCH 17/20] [izlesene] Fix extraction (closes #16233) --- youtube_dl/extractor/izlesene.py | 33 ++++++++++---------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py index b1d72177d..5b2095490 100644 --- a/youtube_dl/extractor/izlesene.py +++ b/youtube_dl/extractor/izlesene.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote from ..utils import ( @@ -72,7 +70,7 @@ class IzleseneIE(InfoExtractor): 'uploadDate', webpage, 'upload date')) duration = float_or_none(self._html_search_regex( - r'"videoduration"\s*:\s*"([^"]+)"', + r'videoduration\s*=\s*\'([^\']+)\'', webpage, 'duration', fatal=False), scale=1000) view_count = str_to_int(get_element_by_id('videoViewCount', webpage)) @@ -80,29 +78,18 @@ class IzleseneIE(InfoExtractor): r'comment_count\s*=\s*\'([^\']+)\';', webpage, 'comment_count', fatal=False) - content_url = self._html_search_meta( - 'contentURL', webpage, 'content URL', fatal=False) - ext = determine_ext(content_url, 'mp4') - - # Might be empty for some videos. - streams = self._html_search_regex( - r'"qualitylevel"\s*:\s*"([^"]+)"', webpage, 'streams', default='') + streams_json = self._html_search_regex( + r'_videoObj\s*=\s*(.+);', webpage, 'streams') + streams = self._parse_json(streams_json, video_id) formats = [] - if streams: - for stream in streams.split('|'): - quality, url = re.search(r'\[(\w+)\](.+)', stream).groups() - formats.append({ - 'format_id': '%sp' % quality if quality else 'sd', - 'url': compat_urllib_parse_unquote(url), - 'ext': ext, - }) - else: - stream_url = self._search_regex( - r'"streamurl"\s*:\s*"([^"]+)"', webpage, 'stream URL') + for stream in streams.get('media').get('level'): + url = stream.get('source') + ext = determine_ext(url, 'mp4') + quality = stream.get('value') formats.append({ - 'format_id': 'sd', - 'url': compat_urllib_parse_unquote(stream_url), + 'format_id': '%sp' % quality, + 'url': compat_urllib_parse_unquote(url), 'ext': ext, }) From 03fad17cb6ae24259808078a165c287c23d77f77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 01:51:38 +0700 Subject: [PATCH 18/20] [izlesene] Improve extraction and fix issues (closes #16407, closes #16271) --- youtube_dl/extractor/izlesene.py | 55 +++++++++++++++++++------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py index 5b2095490..f8fca6c8f 100644 --- a/youtube_dl/extractor/izlesene.py +++ b/youtube_dl/extractor/izlesene.py @@ -2,7 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) from ..utils import ( determine_ext, float_or_none, @@ -55,12 +58,33 @@ class IzleseneIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - url = 'http://www.izlesene.com/video/%s' % video_id - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage('http://www.izlesene.com/video/%s' % video_id, video_id) + + video = self._parse_json( + self._search_regex( + r'videoObj\s*=\s*({.+?})\s*;\s*\n', webpage, 'streams'), + video_id) + + title = video.get('videoTitle') or self._og_search_title(webpage) + + formats = [] + for stream in video['media']['level']: + source_url = stream.get('source') + if not source_url or not isinstance(source_url, compat_str): + continue + ext = determine_ext(url, 'mp4') + quality = stream.get('value') + height = int_or_none(quality) + formats.append({ + 'format_id': '%sp' % quality if quality else 'sd', + 'url': compat_urllib_parse_unquote(source_url), + 'ext': ext, + 'height': height, + }) + self._sort_formats(formats) - title = self._og_search_title(webpage) description = self._og_search_description(webpage, default=None) - thumbnail = self._proto_relative_url( + thumbnail = video.get('posterURL') or self._proto_relative_url( self._og_search_thumbnail(webpage), scheme='http:') uploader = self._html_search_regex( @@ -69,30 +93,15 @@ class IzleseneIE(InfoExtractor): timestamp = parse_iso8601(self._html_search_meta( 'uploadDate', webpage, 'upload date')) - duration = float_or_none(self._html_search_regex( - r'videoduration\s*=\s*\'([^\']+)\'', - webpage, 'duration', fatal=False), scale=1000) + duration = float_or_none(video.get('duration') or self._html_search_regex( + r'videoduration["\']?\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'duration', fatal=False, group='value'), scale=1000) view_count = str_to_int(get_element_by_id('videoViewCount', webpage)) comment_count = self._html_search_regex( r'comment_count\s*=\s*\'([^\']+)\';', webpage, 'comment_count', fatal=False) - streams_json = self._html_search_regex( - r'_videoObj\s*=\s*(.+);', webpage, 'streams') - streams = self._parse_json(streams_json, video_id) - - formats = [] - for stream in streams.get('media').get('level'): - url = stream.get('source') - ext = determine_ext(url, 'mp4') - quality = stream.get('value') - formats.append({ - 'format_id': '%sp' % quality, - 'url': compat_urllib_parse_unquote(url), - 'ext': ext, - }) - return { 'id': video_id, 'title': title, From 9ef5cdb5cb637660decbc82117d5d6790c48ad99 Mon Sep 17 00:00:00 2001 From: rhhayward Date: Fri, 25 May 2018 14:13:29 -0500 Subject: [PATCH 19/20] [audiomack] Stringify video id (closes #15310) --- youtube_dl/extractor/audiomack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py index f3bd4d444..62049b921 100644 --- a/youtube_dl/extractor/audiomack.py +++ b/youtube_dl/extractor/audiomack.py @@ -65,7 +65,7 @@ class AudiomackIE(InfoExtractor): return {'_type': 'url', 'url': api_response['url'], 'ie_key': 'Soundcloud'} return { - 'id': api_response.get('id', album_url_tag), + 'id': compat_str(api_response.get('id', album_url_tag)), 'uploader': api_response.get('artist'), 'title': api_response.get('title'), 'url': api_response['url'], From bdbcc8eecb6d498e5c33dcbfb330d7d82021b3f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Nov=C3=A1k?= Date: Fri, 25 May 2018 21:15:50 +0200 Subject: [PATCH 20/20] [dvtv] Remove dead test --- youtube_dl/extractor/dvtv.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/youtube_dl/extractor/dvtv.py b/youtube_dl/extractor/dvtv.py index 3f760888e..20996962a 100644 --- a/youtube_dl/extractor/dvtv.py +++ b/youtube_dl/extractor/dvtv.py @@ -91,17 +91,6 @@ class DVTVIE(InfoExtractor): }, { 'url': 'http://video.aktualne.cz/v-cechach-poprve-zazni-zelenkova-zrestaurovana-mse/r~45b4b00483ec11e4883b002590604f2e/', 'only_matching': True, - }, { - 'url': 'https://video.aktualne.cz/dvtv/babis-a-zeman-nesou-vinu-za-to-ze-nemame-jasno-v-tom-kdo-bud/r~026afb54fad711e79704ac1f6b220ee8/', - 'md5': '87defe16681b1429c91f7a74809823c6', - 'info_dict': { - 'id': 'f5ae72f6fad611e794dbac1f6b220ee8', - 'ext': 'mp4', - 'title': 'Babiš a Zeman nesou vinu za to, že nemáme jasno v tom, kdo bude vládnout, říká Pekarová Adamová', - }, - 'params': { - 'skip_download': True, - }, }] def _parse_video_metadata(self, js, video_id, live_js=None):