From 79367a98208fbf01d6e04b6747a6e01d0b1f8b9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 14 Jul 2018 18:05:06 +0700 Subject: [PATCH 1/8] [pornhub] Improve extraction and extract all formats (closes #12166, closes #15891, closes #16262, closes #16959) --- youtube_dl/extractor/pornhub.py | 125 ++++++++++++++++++++------------ 1 file changed, 79 insertions(+), 46 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 23e24d216..97f988da4 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -4,28 +4,21 @@ from __future__ import unicode_literals import functools import itertools import operator -# import os import re from .common import InfoExtractor from ..compat import ( compat_HTTPError, - # compat_urllib_parse_unquote, - # compat_urllib_parse_unquote_plus, - # compat_urllib_parse_urlparse, + compat_str, ) from ..utils import ( ExtractorError, int_or_none, js_to_json, orderedSet, - # sanitized_Request, remove_quotes, str_to_int, ) -# from ..aes import ( -# aes_decrypt_text -# ) class PornHubIE(InfoExtractor): @@ -62,7 +55,7 @@ class PornHubIE(InfoExtractor): 'id': '1331683002', 'ext': 'mp4', 'title': '重庆婷婷女王足交', - 'uploader': 'cj397186295', + 'uploader': 'Unknown', 'duration': 1753, 'view_count': int, 'like_count': int, @@ -121,7 +114,7 @@ class PornHubIE(InfoExtractor): self._set_cookie('pornhub.com', 'platform', platform) return self._download_webpage( 'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id, - video_id) + video_id, 'Downloading %s webpage' % platform) webpage = dl_webpage('pc') @@ -134,48 +127,19 @@ class PornHubIE(InfoExtractor): 'PornHub said: %s' % error_msg, expected=True, video_id=video_id) - tv_webpage = dl_webpage('tv') - - assignments = self._search_regex( - r'(var.+?mediastring.+?)', tv_webpage, - 'encoded url').split(';') - - js_vars = {} - - def parse_js_value(inp): - inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp) - if '+' in inp: - inps = inp.split('+') - return functools.reduce( - operator.concat, map(parse_js_value, inps)) - inp = inp.strip() - if inp in js_vars: - return js_vars[inp] - return remove_quotes(inp) - - for assn in assignments: - assn = assn.strip() - if not assn: - continue - assn = re.sub(r'var\s+', '', assn) - vname, value = assn.split('=', 1) - js_vars[vname] = parse_js_value(value) - - video_url = js_vars['mediastring'] - - title = self._search_regex( - r'

([^>]+)

', tv_webpage, 'title', default=None) - # video_title from flashvars contains whitespace instead of non-ASCII (see # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying # on that anymore. - title = title or self._html_search_meta( + title = self._html_search_meta( 'twitter:title', webpage, default=None) or self._search_regex( (r']+class=["\']title["\'][^>]*>(?P[^<]+)', r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1', r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'), webpage, 'title', group='title') + video_urls = [] + video_urls_set = set() + flashvars = self._parse_json( self._search_regex( r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), @@ -183,8 +147,78 @@ class PornHubIE(InfoExtractor): if flashvars: thumbnail = flashvars.get('image_url') duration = int_or_none(flashvars.get('video_duration')) + media_definitions = flashvars.get('mediaDefinitions') + if isinstance(media_definitions, list): + for definition in media_definitions: + if not isinstance(definition, dict): + continue + video_url = definition.get('videoUrl') + if not video_url or not isinstance(video_url, compat_str): + continue + if video_url in video_urls_set: + continue + video_urls_set.add(video_url) + video_urls.append( + (video_url, int_or_none(definition.get('quality')))) else: - title, thumbnail, duration = [None] * 3 + thumbnail, duration = [None] * 2 + + if not video_urls: + tv_webpage = dl_webpage('tv') + + assignments = self._search_regex( + r'(var.+?mediastring.+?)</script>', tv_webpage, + 'encoded url').split(';') + + js_vars = {} + + def parse_js_value(inp): + inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp) + if '+' in inp: + inps = inp.split('+') + return functools.reduce( + operator.concat, map(parse_js_value, inps)) + inp = inp.strip() + if inp in js_vars: + return js_vars[inp] + return remove_quotes(inp) + + for assn in assignments: + assn = assn.strip() + if not assn: + continue + assn = re.sub(r'var\s+', '', assn) + vname, value = assn.split('=', 1) + js_vars[vname] = parse_js_value(value) + + video_url = js_vars['mediastring'] + if video_url not in video_urls_set: + video_urls.append((video_url, None)) + video_urls_set.add(video_url) + + for mobj in re.finditer( + r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage): + video_url = mobj.group('url') + if video_url not in video_urls_set: + video_urls.append((video_url, None)) + video_urls_set.add(video_url) + + formats = [] + for video_url, height in video_urls: + tbr = None + mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url) + if mobj: + if not height: + height = int(mobj.group('height')) + tbr = int(mobj.group('tbr')) + formats.append({ + 'url': video_url, + 'format_id': '%dp' % height if height else None, + 'height': height, + 'tbr': tbr, + }) + self._sort_formats(formats) video_uploader = self._html_search_regex( r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:user|channel)s/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', @@ -210,7 +244,6 @@ class PornHubIE(InfoExtractor): return { 'id': video_id, - 'url': video_url, 'uploader': video_uploader, 'title': title, 'thumbnail': thumbnail, @@ -219,7 +252,7 @@ class PornHubIE(InfoExtractor): 'like_count': like_count, 'dislike_count': dislike_count, 'comment_count': comment_count, - # 'formats': formats, + 'formats': formats, 'age_limit': 18, 'tags': tags, 'categories': categories, From 905eef2b06f1e890b1dfd228aa4fa1fa2308d687 Mon Sep 17 00:00:00 2001 From: Jakub Wilk <jwilk@jwilk.net> Date: Wed, 18 Jul 2018 18:47:26 +0200 Subject: [PATCH 2/8] [imgur] Allow digits in filename extension --- youtube_dl/extractor/imgur.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index 2901960a5..ecc958a17 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -12,7 +12,7 @@ from ..utils import ( class ImgurIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|(?:topic|r)/[^/]+)/)?(?P<id>[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z]+)?$' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|(?:topic|r)/[^/]+)/)?(?P<id>[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z0-9]+)?$' _TESTS = [{ 'url': 'https://i.imgur.com/A61SaA1.gifv', @@ -43,6 +43,9 @@ class ImgurIE(InfoExtractor): }, { 'url': 'http://imgur.com/r/aww/VQcQPhM', 'only_matching': True, + }, { + 'url': 'https://i.imgur.com/crGpqCV.mp4', + 'only_matching': True, }] def _real_extract(self, url): From bd21ead2a20ff16ec8cb10da72526103471069d6 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 18 Jul 2018 18:29:18 +0100 Subject: [PATCH 3/8] [extractor/common] add support for DASH and MSS formats extraction in SMIL manifests --- youtube_dl/extractor/common.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5d4db54d5..b8bbaf81a 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1859,9 +1859,7 @@ class InfoExtractor(object): 'height': height, }) formats.extend(m3u8_formats) - continue - - if src_ext == 'f4m': + elif src_ext == 'f4m': f4m_url = src_url if not f4m_params: f4m_params = { @@ -1871,9 +1869,13 @@ class InfoExtractor(object): f4m_url += '&' if '?' in f4m_url else '?' f4m_url += compat_urllib_parse_urlencode(f4m_params) formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) - continue - - if src_url.startswith('http') and self._is_valid_url(src, video_id): + elif src_ext == 'mpd': + formats.extend(self._extract_mpd_formats( + src_url, video_id, mpd_id='dash', fatal=False)) + elif re.search(r'\.ism/[Mm]anifest', src_url): + formats.extend(self._extract_ism_formats( + src_url, video_id, ism_id='mss', fatal=False)) + elif src_url.startswith('http') and self._is_valid_url(src, video_id): http_count += 1 formats.append({ 'url': src_url, @@ -1884,7 +1886,6 @@ class InfoExtractor(object): 'width': width, 'height': height, }) - continue return formats From 371dcc1dd4b29001910005c1d3e416db204cc262 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 18 Jul 2018 18:31:40 +0100 Subject: [PATCH 4/8] [theplatform] add support for theplatform Top-level domain customization(#16977) --- youtube_dl/extractor/theplatform.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index b1a985ff6..e7dc6071c 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -32,13 +32,14 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) class ThePlatformBaseIE(OnceIE): + _TP_TLD = 'com' def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): meta = self._download_xml( smil_url, video_id, note=note, query={'format': 'SMIL'}, headers=self.geo_verification_headers()) error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src') if error_element is not None and error_element.attrib['src'].startswith( - 'http://link.theplatform.com/s/errorFiles/Unavailable.'): + 'http://link.theplatform.%s/s/errorFiles/Unavailable.' % self._TP_TLD): raise ExtractorError(error_element.attrib['abstract'], expected=True) smil_formats = self._parse_smil_formats( @@ -66,7 +67,7 @@ class ThePlatformBaseIE(OnceIE): return formats, subtitles def _download_theplatform_metadata(self, path, video_id): - info_url = 'http://link.theplatform.com/s/%s?format=preview' % path + info_url = 'http://link.theplatform.%s/s/%s?format=preview' % (self._TP_TLD, path) return self._download_json(info_url, video_id) def _parse_theplatform_metadata(self, info): From 38f1eb0ac3be5d3b61e7722db0024e61cf98eb69 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 18 Jul 2018 18:33:33 +0100 Subject: [PATCH 5/8] [mediaset] fix extraction(closes #16977) --- youtube_dl/extractor/mediaset.py | 155 +++++++++++++++---------------- 1 file changed, 74 insertions(+), 81 deletions(-) diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index 9f2b60dcc..57f97409d 100644 --- a/youtube_dl/extractor/mediaset.py +++ b/youtube_dl/extractor/mediaset.py @@ -3,75 +3,75 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor -from ..compat import compat_str +from .theplatform import ThePlatformBaseIE from ..utils import ( - determine_ext, - parse_duration, - try_get, - unified_strdate, + ExtractorError, + int_or_none, + update_url_query, ) -class MediasetIE(InfoExtractor): +class MediasetIE(ThePlatformBaseIE): + _TP_TLD = 'eu' _VALID_URL = r'''(?x) (?: mediaset:| https?:// - (?:www\.)?video\.mediaset\.it/ + (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/ (?: (?:video|on-demand)/(?:[^/]+/)+[^/]+_| - player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid= + player/index\.html\?.*?\bprogramGuid= ) - )(?P<id>[0-9]+) + )(?P<id>[0-9A-Z]{16}) ''' _TESTS = [{ # full episode - 'url': 'http://www.video.mediaset.it/video/hello_goodbye/full/quarta-puntata_661824.html', + 'url': 'https://www.mediasetplay.mediaset.it/video/hellogoodbye/quarta-puntata_FAFU000000661824', 'md5': '9b75534d42c44ecef7bf1ffeacb7f85d', 'info_dict': { - 'id': '661824', + 'id': 'FAFU000000661824', 'ext': 'mp4', 'title': 'Quarta puntata', - 'description': 'md5:7183696d6df570e3412a5ef74b27c5e2', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1414, - 'creator': 'mediaset', + 'duration': 1414.26, 'upload_date': '20161107', 'series': 'Hello Goodbye', - 'categories': ['reality'], + 'timestamp': 1478532900, + 'uploader': 'Rete 4', + 'uploader_id': 'R4', }, - 'expected_warnings': ['is not a supported codec'], }, { - 'url': 'http://www.video.mediaset.it/video/matrix/full_chiambretti/puntata-del-25-maggio_846685.html', - 'md5': '1276f966ac423d16ba255ce867de073e', + 'url': 'https://www.mediasetplay.mediaset.it/video/matrix/puntata-del-25-maggio_F309013801000501', + 'md5': '288532f0ad18307705b01e581304cd7b', 'info_dict': { - 'id': '846685', + 'id': 'F309013801000501', 'ext': 'mp4', 'title': 'Puntata del 25 maggio', - 'description': 'md5:ee2e456e3eb1dba5e814596655bb5296', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 6565, - 'creator': 'mediaset', - 'upload_date': '20180525', + 'duration': 6565.007, + 'upload_date': '20180526', 'series': 'Matrix', - 'categories': ['infotainment'], + 'timestamp': 1527326245, + 'uploader': 'Canale 5', + 'uploader_id': 'C5', }, 'expected_warnings': ['HTTP Error 403: Forbidden'], }, { # clip - 'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html', + 'url': 'https://www.mediasetplay.mediaset.it/video/gogglebox/un-grande-classico-della-commedia-sexy_FAFU000000661680', 'only_matching': True, }, { # iframe simple - 'url': 'http://www.video.mediaset.it/player/playerIFrame.shtml?id=665924&autoplay=true', + 'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665924&id=665924', 'only_matching': True, }, { # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/) - 'url': 'https://www.video.mediaset.it/player/playerIFrameTwitter.shtml?id=665104&playrelated=false&autoplay=false&related=true&hidesocial=true', + 'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665104&id=665104', 'only_matching': True, }, { - 'url': 'mediaset:661824', + 'url': 'mediaset:FAFU000000665924', 'only_matching': True, }] @@ -84,61 +84,54 @@ class MediasetIE(InfoExtractor): webpage)] def _real_extract(self, url): - video_id = self._match_id(url) - - video = self._download_json( - 'https://www.video.mediaset.it/html/metainfo.sjson', - video_id, 'Downloading media info', query={ - 'id': video_id - })['video'] - - title = video['title'] - media_id = video.get('guid') or video_id - - video_list = self._download_json( - 'http://cdnsel01.mediaset.net/GetCdn2018.aspx', - video_id, 'Downloading video CDN JSON', query={ - 'streamid': media_id, - 'format': 'json', - })['videoList'] + guid = self._match_id(url) + tp_path = 'PR1GhC/media/guid/2702976343/' + guid + info = self._extract_theplatform_metadata(tp_path, guid) formats = [] - for format_url in video_list: - ext = determine_ext(format_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - format_url, video_id, mpd_id='dash', fatal=False)) - elif ext == 'ism' or '.ism' in format_url: - formats.extend(self._extract_ism_formats( - format_url, video_id, ism_id='mss', fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': determine_ext(format_url), - }) + subtitles = {} + first_e = None + for asset_type in ('SD', 'HD'): + for f in ('MPEG4', 'MPEG-DASH', 'M3U', 'ISM'): + try: + tp_formats, tp_subtitles = self._extract_theplatform_smil( + update_url_query('http://link.theplatform.%s/s/%s' % (self._TP_TLD, tp_path), { + 'mbr': 'true', + 'formats': f, + 'assetTypes': asset_type, + }), guid, 'Downloading %s %s SMIL data' % (f, asset_type)) + except ExtractorError as e: + if not first_e: + first_e = e + break + for tp_f in tp_formats: + tp_f['quality'] = 1 if asset_type == 'HD' else 0 + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + if first_e and not formats: + raise first_e self._sort_formats(formats) - creator = try_get( - video, lambda x: x['brand-info']['publisher'], compat_str) - category = try_get( - video, lambda x: x['brand-info']['category'], compat_str) - categories = [category] if category else None + fields = [] + for templ, repls in (('tvSeason%sNumber', ('', 'Episode')), ('mediasetprogram$%s', ('brandTitle', 'numberOfViews', 'publishInfo'))): + fields.extend(templ % repl for repl in repls) + feed_data = self._download_json( + 'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs/guid/-/' + guid, + guid, fatal=False, query={'fields': ','.join(fields)}) + if feed_data: + publish_info = feed_data.get('mediasetprogram$publishInfo') or {} + info.update({ + 'episode_number': int_or_none(feed_data.get('tvSeasonEpisodeNumber')), + 'season_number': int_or_none(feed_data.get('tvSeasonNumber')), + 'series': feed_data.get('mediasetprogram$brandTitle'), + 'uploader': publish_info.get('description'), + 'uploader_id': publish_info.get('channel'), + 'view_count': int_or_none(feed_data.get('mediasetprogram$numberOfViews')), + }) - return { - 'id': video_id, - 'title': title, - 'description': video.get('short-description'), - 'thumbnail': video.get('thumbnail'), - 'duration': parse_duration(video.get('duration')), - 'creator': creator, - 'upload_date': unified_strdate(video.get('production-date')), - 'webpage_url': video.get('url'), - 'series': video.get('brand-value'), - 'season': video.get('season'), - 'categories': categories, + info.update({ + 'id': guid, 'formats': formats, - } + 'subtitles': subtitles, + }) + return info From c63f5fb8633dda1b0a673c90d537e93497a8d62d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 19 Jul 2018 01:59:00 +0700 Subject: [PATCH 6/8] [slutload] Fix and improve extraction (closes #17001) --- youtube_dl/extractor/slutload.py | 57 +++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/slutload.py b/youtube_dl/extractor/slutload.py index 6fc2ff60d..661f9e59d 100644 --- a/youtube_dl/extractor/slutload.py +++ b/youtube_dl/extractor/slutload.py @@ -1,12 +1,10 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor class SlutloadIE(InfoExtractor): - _VALID_URL = r'^https?://(?:\w+\.)?slutload\.com/video/[^/]+/(?P<id>[^/]+)/?$' + _VALID_URL = r'https?://(?:\w+\.)?slutload\.com/(?:video/[^/]+|embed_player|watch)/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.slutload.com/video/virginie-baisee-en-cam/TD73btpBqSxc/', 'md5': '868309628ba00fd488cf516a113fd717', @@ -16,33 +14,52 @@ class SlutloadIE(InfoExtractor): 'title': 'virginie baisee en cam', 'age_limit': 18, 'thumbnail': r're:https?://.*?\.jpg' - } + }, }, { # mobile site 'url': 'http://mobile.slutload.com/video/masturbation-solo/fviFLmc6kzJ/', 'only_matching': True, + }, { + 'url': 'http://www.slutload.com/embed_player/TD73btpBqSxc/', + 'only_matching': True, + }, { + 'url': 'http://www.slutload.com/watch/TD73btpBqSxc/Virginie-Baisee-En-Cam.html', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - desktop_url = re.sub(r'^(https?://)mobile\.', r'\1', url) - webpage = self._download_webpage(desktop_url, video_id) + embed_page = self._download_webpage( + 'http://www.slutload.com/embed_player/%s' % video_id, video_id, + 'Downloading embed page', fatal=False) - video_title = self._html_search_regex(r'<h1><strong>([^<]+)</strong>', - webpage, 'title').strip() + if embed_page: + def extract(what): + return self._html_search_regex( + r'data-video-%s=(["\'])(?P<url>(?:(?!\1).)+)\1' % what, + embed_page, 'video %s' % what, default=None, group='url') - video_url = self._html_search_regex( - r'(?s)<div id="vidPlayer"\s+data-url="([^"]+)"', - webpage, 'video URL') - thumbnail = self._html_search_regex( - r'(?s)<div id="vidPlayer"\s+.*?previewer-file="([^"]+)"', - webpage, 'thumbnail', fatal=False) + video_url = extract('url') + if video_url: + title = self._html_search_regex( + r'<title>([^<]+)', embed_page, 'title', default=video_id) + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'thumbnail': extract('preview'), + 'age_limit': 18 + } - return { + webpage = self._download_webpage( + 'http://www.slutload.com/video/_/%s/' % video_id, video_id) + title = self._html_search_regex( + r'<h1><strong>([^<]+)</strong>', webpage, 'title').strip() + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + info.update({ 'id': video_id, - 'url': video_url, - 'title': video_title, - 'thumbnail': thumbnail, - 'age_limit': 18 - } + 'title': title, + 'age_limit': 18, + }) + return info From 8da17f96803faa35ab19352bdcd2777011d8812a Mon Sep 17 00:00:00 2001 From: bato3 <bato3@bandyci.org> Date: Wed, 18 Jul 2018 21:04:05 +0200 Subject: [PATCH 7/8] [dailymotion] Improve description extraction (closes #16984) --- youtube_dl/extractor/dailymotion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 9a74906cb..8f5f57b98 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -144,7 +144,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor): age_limit = self._rta_search(webpage) - description = self._og_search_description(webpage) or self._html_search_meta( + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( 'description', webpage, 'description') view_count_str = self._search_regex( From 11330f5121732f80e3d6ba1c34102955427eb04b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 19 Jul 2018 02:25:19 +0700 Subject: [PATCH 8/8] [facebook] Extract view count and update tests (closes #16942) --- youtube_dl/extractor/facebook.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 8a9ed96c2..f78479b92 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -20,6 +20,7 @@ from ..utils import ( int_or_none, js_to_json, limit_length, + parse_count, sanitized_Request, try_get, urlencode_postdata, @@ -75,7 +76,7 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '274175099429670', 'ext': 'mp4', - 'title': 'Asif Nawab Butt posted a video to his Timeline.', + 'title': 're:^Asif Nawab Butt posted a video', 'uploader': 'Asif Nawab Butt', 'upload_date': '20140506', 'timestamp': 1399398998, @@ -133,7 +134,7 @@ class FacebookIE(InfoExtractor): }, { # have 1080P, but only up to 720p in swf params 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', - 'md5': '0d9813160b146b3bc8744e006027fcc6', + 'md5': '9571fae53d4165bbbadb17a94651dcdc', 'info_dict': { 'id': '10155529876156509', 'ext': 'mp4', @@ -142,6 +143,7 @@ class FacebookIE(InfoExtractor): 'upload_date': '20161030', 'uploader': 'CNN', 'thumbnail': r're:^https?://.*', + 'view_count': int, }, }, { # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall @@ -149,7 +151,7 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '1417995061575415', 'ext': 'mp4', - 'title': 'md5:a7b86ca673f51800cd54687b7f4012fe', + 'title': 'md5:1db063d6a8c13faa8da727817339c857', 'timestamp': 1486648217, 'upload_date': '20170209', 'uploader': 'Yaroslav Korpan', @@ -176,7 +178,7 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '1396382447100162', 'ext': 'mp4', - 'title': 'md5:e2d2700afdf84e121f5d0f999bad13a3', + 'title': 'md5:19a428bbde91364e3de815383b54a235', 'timestamp': 1486035494, 'upload_date': '20170202', 'uploader': 'Elisabeth Ahtn', @@ -426,6 +428,10 @@ class FacebookIE(InfoExtractor): 'timestamp', default=None)) thumbnail = self._og_search_thumbnail(webpage) + view_count = parse_count(self._search_regex( + r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count', + default=None)) + info_dict = { 'id': video_id, 'title': video_title, @@ -433,6 +439,7 @@ class FacebookIE(InfoExtractor): 'uploader': uploader, 'timestamp': timestamp, 'thumbnail': thumbnail, + 'view_count': view_count, } return webpage, info_dict